embedprobe 0.0.0__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+
7
+ jobs:
8
+ test:
9
+ runs-on: ubuntu-latest
10
+ strategy:
11
+ fail-fast: false
12
+ matrix:
13
+ python-version: ["3.10", "3.12"]
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - uses: actions/setup-python@v5
17
+ with:
18
+ python-version: ${{ matrix.python-version }}
19
+ - name: Install package
20
+ run: python -m pip install -e ".[dev]"
21
+ - name: Run tests
22
+ run: pytest
@@ -0,0 +1,12 @@
1
+ context
2
+ CLAUDE.md
3
+ .venv/
4
+ dist/
5
+ build/
6
+ *.egg-info/
7
+ .pytest_cache/
8
+ __pycache__/
9
+ *.py[cod]
10
+ .claude
11
+ .agents
12
+
@@ -0,0 +1,105 @@
1
+ Metadata-Version: 2.4
2
+ Name: embedprobe
3
+ Version: 0.1.0
4
+ Summary: A diagnostic toolkit for evaluating and selecting multilingual embedding models on your data.
5
+ Project-URL: Homepage, https://github.com/Sainath26/embedprobe
6
+ Author-email: Harish Sainath S <harishsainth036@gmail.com>
7
+ License: MIT
8
+ Keywords: diagnostics,embeddings,evaluation,model-selection,nlp,transformers
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Requires-Python: >=3.10
15
+ Requires-Dist: matplotlib>=3.6
16
+ Requires-Dist: numpy>=1.22
17
+ Requires-Dist: pandas>=1.5
18
+ Requires-Dist: rich>=13
19
+ Requires-Dist: scikit-learn>=1.1
20
+ Requires-Dist: scipy>=1.9
21
+ Requires-Dist: sentence-transformers>=2.2
22
+ Requires-Dist: typer>=0.9
23
+ Requires-Dist: umap-learn>=0.5
24
+ Provides-Extra: dev
25
+ Requires-Dist: build; extra == 'dev'
26
+ Requires-Dist: pytest>=7; extra == 'dev'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # embedprobe
30
+
31
+ **A diagnostic toolkit for evaluating and selecting multilingual embedding models on _your_ data.**
32
+
33
+ Leaderboards like [MTEB](https://github.com/embeddings-benchmark/mteb) tell you _which_ embedding
34
+ model ranks higher on average. **embedprobe tells you _why_ a model fails on your domain and
35
+ language pair,** so you can pick a compact encoder for your task without fine-tuning every
36
+ candidate.
37
+
38
+ Given a parallel dataset (source text, target text, topic), embedprobe dissects each candidate
39
+ model across four diagnostic levels:
40
+
41
+ | Level | Question it answers | Signals |
42
+ | ------------------------ | ----------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- |
43
+ | **0 — Signal vs. noise** | Does the model separate true pairs from random ones at all? | true-vs-random cosine distributions, SNR, Kolmogorov–Smirnov test |
44
+ | **1 — Retrieval** | How reliably does it retrieve the true counterpart? | Recall@k, Precision@k, MRR, CMC curve |
45
+ | **2 — Topic structure** | Is the space organized by meaning or leaking across topics? | retrieval-based topic confusion, topic-pair average cosine, UMAP projections |
46
+ | **3 — Error taxonomy** | _Why_ do retrievals miss? | Jaccard-based categorization of misses into **lexical confusion**, **semantic confusion**, and **topic-boundary fuzziness** |
47
+
48
+ The Level 3 error taxonomy is the headline: instead of a single aggregate score, each retrieval
49
+ miss is classified by token-overlap between the retrieved and the true target, telling you whether
50
+ a model is being fooled by surface overlap, drifting semantically, or blurring topic boundaries.
51
+
52
+ ## Install
53
+
54
+ ```bash
55
+ pip install embedprobe
56
+ ```
57
+
58
+ UMAP projection support is included for Level 2 visual diagnostics.
59
+
60
+ ## Quickstart
61
+
62
+ ```python
63
+ import pandas as pd
64
+ from embedprobe import probe
65
+
66
+ # parallel data: one row per pair, plus a topic column
67
+ df = pd.read_csv("pairs.csv") # columns: en, es, topic
68
+
69
+ report = probe(
70
+ models=["sentence-transformers/LaBSE",
71
+ "sentence-transformers/distiluse-base-multilingual-cased-v2"],
72
+ data=df,
73
+ src_col="en",
74
+ tgt_col="es",
75
+ seed=42,
76
+ )
77
+
78
+ report.summary() # cross-model DataFrame of all metrics
79
+ report.to_json("report.json")
80
+ report.to_html("report.html") # self-contained diagnostic dashboard
81
+ ```
82
+
83
+ Or from the command line:
84
+
85
+ ```bash
86
+ embedprobe run --models sentence-transformers/LaBSE --data pairs.csv \
87
+ --src en --tgt es --out report
88
+ ```
89
+
90
+ ## Status
91
+
92
+ Pre-release (`0.x`). The toolkit originates from an MSc dissertation study of 21
93
+ sentence-transformer models across EN–ES, EN–FR and EN–ZH parallel data; the packaged version is
94
+ being hardened for a 2026 workshop submission. APIs may change until `1.0`.
95
+
96
+ ## Roadmap
97
+
98
+ - [ ] Empirical validation of the Jaccard taxonomy thresholds
99
+ - [ ] Selection-prediction study: do these diagnostics predict downstream task ranking?
100
+ - [ ] MTEB adapter: shortlist from the leaderboard → diagnose on your data
101
+ - [ ] Decoder-only LLM support (MEXA-style alignment probing)
102
+
103
+ ## License
104
+
105
+ MIT
@@ -0,0 +1,77 @@
1
+ # embedprobe
2
+
3
+ **A diagnostic toolkit for evaluating and selecting multilingual embedding models on _your_ data.**
4
+
5
+ Leaderboards like [MTEB](https://github.com/embeddings-benchmark/mteb) tell you _which_ embedding
6
+ model ranks higher on average. **embedprobe tells you _why_ a model fails on your domain and
7
+ language pair,** so you can pick a compact encoder for your task without fine-tuning every
8
+ candidate.
9
+
10
+ Given a parallel dataset (source text, target text, topic), embedprobe dissects each candidate
11
+ model across four diagnostic levels:
12
+
13
+ | Level | Question it answers | Signals |
14
+ | ------------------------ | ----------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- |
15
+ | **0 — Signal vs. noise** | Does the model separate true pairs from random ones at all? | true-vs-random cosine distributions, SNR, Kolmogorov–Smirnov test |
16
+ | **1 — Retrieval** | How reliably does it retrieve the true counterpart? | Recall@k, Precision@k, MRR, CMC curve |
17
+ | **2 — Topic structure** | Is the space organized by meaning or leaking across topics? | retrieval-based topic confusion, topic-pair average cosine, UMAP projections |
18
+ | **3 — Error taxonomy** | _Why_ do retrievals miss? | Jaccard-based categorization of misses into **lexical confusion**, **semantic confusion**, and **topic-boundary fuzziness** |
19
+
20
+ The Level 3 error taxonomy is the headline: instead of a single aggregate score, each retrieval
21
+ miss is classified by token-overlap between the retrieved and the true target, telling you whether
22
+ a model is being fooled by surface overlap, drifting semantically, or blurring topic boundaries.
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ pip install embedprobe
28
+ ```
29
+
30
+ UMAP projection support is included for Level 2 visual diagnostics.
31
+
32
+ ## Quickstart
33
+
34
+ ```python
35
+ import pandas as pd
36
+ from embedprobe import probe
37
+
38
+ # parallel data: one row per pair, plus a topic column
39
+ df = pd.read_csv("pairs.csv") # columns: en, es, topic
40
+
41
+ report = probe(
42
+ models=["sentence-transformers/LaBSE",
43
+ "sentence-transformers/distiluse-base-multilingual-cased-v2"],
44
+ data=df,
45
+ src_col="en",
46
+ tgt_col="es",
47
+ seed=42,
48
+ )
49
+
50
+ report.summary() # cross-model DataFrame of all metrics
51
+ report.to_json("report.json")
52
+ report.to_html("report.html") # self-contained diagnostic dashboard
53
+ ```
54
+
55
+ Or from the command line:
56
+
57
+ ```bash
58
+ embedprobe run --models sentence-transformers/LaBSE --data pairs.csv \
59
+ --src en --tgt es --out report
60
+ ```
61
+
62
+ ## Status
63
+
64
+ Pre-release (`0.x`). The toolkit originates from an MSc dissertation study of 21
65
+ sentence-transformer models across EN–ES, EN–FR and EN–ZH parallel data; the packaged version is
66
+ being hardened for a 2026 workshop submission. APIs may change until `1.0`.
67
+
68
+ ## Roadmap
69
+
70
+ - [ ] Empirical validation of the Jaccard taxonomy thresholds
71
+ - [ ] Selection-prediction study: do these diagnostics predict downstream task ranking?
72
+ - [ ] MTEB adapter: shortlist from the leaderboard → diagnose on your data
73
+ - [ ] Decoder-only LLM support (MEXA-style alignment probing)
74
+
75
+ ## License
76
+
77
+ MIT
@@ -0,0 +1,45 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "embedprobe"
7
+ version = "0.1.0"
8
+ description = "A diagnostic toolkit for evaluating and selecting multilingual embedding models on your data."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Harish Sainath S", email = "harishsainth036@gmail.com" },
14
+ ]
15
+ keywords = ["embeddings", "nlp", "evaluation", "diagnostics", "transformers", "model-selection"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Science/Research",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
+ ]
23
+ dependencies = [
24
+ "numpy>=1.22",
25
+ "pandas>=1.5",
26
+ "scipy>=1.9",
27
+ "scikit-learn>=1.1",
28
+ "matplotlib>=3.6",
29
+ "sentence-transformers>=2.2",
30
+ "typer>=0.9",
31
+ "rich>=13",
32
+ "umap-learn>=0.5",
33
+ ]
34
+
35
+ [project.optional-dependencies]
36
+ dev = ["pytest>=7", "build"]
37
+
38
+ [project.scripts]
39
+ embedprobe = "embedprobe.cli:app"
40
+
41
+ [project.urls]
42
+ Homepage = "https://github.com/Sainath26/embedprobe"
43
+
44
+ [tool.hatch.build.targets.wheel]
45
+ packages = ["src/embedprobe"]
@@ -0,0 +1,19 @@
1
+ """embedprobe — a diagnostic toolkit for evaluating language-model embedding spaces."""
2
+
3
+ __version__ = "0.1.0"
4
+
5
+
6
def __getattr__(name):
    """Lazily resolve the package's public names (PEP 562 module hook).

    Lazy imports keep ``import embedprobe`` cheap and avoid circular imports.

    Args:
        name: Attribute requested on the package that was not found normally.

    Returns:
        The ``probe`` function, or the ``ProbeReport`` / ``ModelDiagnostics``
        attribute of ``embedprobe.report``.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
    """
    if name == "probe":
        from embedprobe.probe import probe

        # Importing the ``embedprobe.probe`` submodule makes the import system
        # bind the *module* to the package attribute ``probe``. Without the
        # rebinding below, every later ``embedprobe.probe`` lookup (or
        # ``from embedprobe import probe``) would find that module in the
        # package namespace, skip this hook, and hand back a non-callable.
        globals()["probe"] = probe
        return probe
    if name in ("ProbeReport", "ModelDiagnostics"):
        from embedprobe import report

        value = getattr(report, name)
        # Cache so subsequent lookups do not go through this hook again.
        globals()[name] = value
        return value
    raise AttributeError(f"module 'embedprobe' has no attribute {name!r}")
17
+
18
+
19
+ __all__ = ["probe", "ProbeReport", "ModelDiagnostics", "__version__"]
@@ -0,0 +1,85 @@
1
+ """Command-line interface: ``embedprobe run`` and ``embedprobe compare``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import List, Optional
8
+
9
+ from rich.console import Console
10
+ import typer
11
+
12
+ app = typer.Typer(help="Diagnose multilingual embedding models on your own data.")
13
+ console = Console()
14
+
15
+
16
@app.command()
def run(
    models: List[str] = typer.Option(..., "--models", "-m", help="Hub model name (repeatable)."),
    data: Path = typer.Option(..., "--data", "-d", help="CSV/Parquet with parallel pairs."),
    src_col: str = typer.Option(..., "--src", "--src-col", help="Source-language text column."),
    tgt_col: str = typer.Option(..., "--tgt", "--tgt-col", help="Target-language text column."),
    topic_col: str = typer.Option("topic", help="Topic column (Level 2 is skipped if absent)."),
    out: Path = typer.Option(Path("embedprobe_report"), "--out", "-o", help="Output basename."),
    seed: int = typer.Option(42, help="Random seed."),
    batch_size: int = typer.Option(32, help="Encoding batch size."),
    device: Optional[str] = typer.Option(None, help="Torch device, e.g. cuda."),
    max_pairs: Optional[int] = typer.Option(None, help="Subsample the dataset to this many pairs."),
    umap: bool = typer.Option(False, "--umap", help="Compute UMAP projections (needs umap-learn)."),
):
    """Run the four-level diagnostic and write <out>.json and <out>.html."""
    # Imported here so that loading the CLI module does not import the probe
    # pipeline up front.
    from embedprobe.probe import probe

    # ``--models`` may be repeated and/or comma-separated; flatten it first.
    report = probe(
        models=_normalize_models(models),
        data=data,
        src_col=src_col,
        tgt_col=tgt_col,
        topic_col=topic_col,
        seed=seed,
        batch_size=batch_size,
        device=device,
        max_pairs=max_pairs,
        compute_umap=umap,
    )

    # Both artefacts share the output basename and differ only by suffix.
    out.parent.mkdir(parents=True, exist_ok=True)
    json_path = out.with_suffix(".json")
    html_path = out.with_suffix(".html")
    report.to_json(json_path)
    report.to_html(html_path)

    summary_text = report.summary().round(4).to_string()
    console.print(summary_text)
    console.print(f"\n[green]Wrote[/green] {json_path} and {html_path}")
45
+
46
+
47
@app.command()
def compare(
    reports: List[Path] = typer.Argument(..., help="embedprobe JSON reports to merge."),
    out: Optional[Path] = typer.Option(None, "--out", "-o", help="Write merged HTML here."),
):
    """Merge previously saved JSON reports into one comparison."""
    from embedprobe.report import ModelDiagnostics, ProbeReport

    collected = []
    run_meta = {}
    for report_path in reports:
        payload = json.loads(report_path.read_text(encoding="utf-8"))
        if not run_meta:
            # The first report with a non-empty "run" section supplies the
            # run metadata for the merged report.
            run_meta = payload.get("run", {})
        collected.extend(
            ModelDiagnostics(
                # Fall back to the file stem when an entry carries no model name.
                model_name=entry.get("model", report_path.stem),
                meta=entry.get("meta", {}),
                level0=entry.get("level0"),
                level1=entry.get("level1"),
                level2=entry.get("level2"),
                level3=entry.get("level3"),
            )
            for entry in payload.get("models", [])
        )

    merged = ProbeReport(models=collected, run_meta=run_meta)
    console.print(merged.summary().round(4).to_string())
    if out is None:
        return
    merged.to_html(out)
    console.print(f"\n[green]Wrote[/green] {out}")
75
+
76
+
77
def _normalize_models(models: List[str]) -> List[str]:
    """Flatten model arguments into a list of clean model names.

    Each item may itself be a comma-separated list; pieces are stripped of
    surrounding whitespace and empty pieces are discarded. Order is preserved.
    """
    pieces = (chunk.strip() for raw in models for chunk in raw.split(","))
    return [piece for piece in pieces if piece]
82
+
83
+
84
+ if __name__ == "__main__":
85
+ app()
@@ -0,0 +1,67 @@
1
+ """Loading and validating parallel-pair datasets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ from pathlib import Path
7
+ from typing import Optional, Union
8
+
9
+ import pandas as pd
10
+
11
+ TOPIC_COL = "topic"
12
+
13
+
14
def load_pairs(
    source: Union[str, Path, pd.DataFrame],
    src_col: str,
    tgt_col: str,
    topic_col: Optional[str] = TOPIC_COL,
    max_pairs: Optional[int] = None,
    seed: Optional[int] = None,
) -> pd.DataFrame:
    """Load and clean a parallel dataset.

    Args:
        source: A DataFrame (copied, never mutated) or a path to a file. Paths
            ending in ``.parquet`` are read as Parquet, anything else as CSV.
        src_col: Name of the source-text column (required).
        tgt_col: Name of the target-text column (required).
        topic_col: Optional topic column; kept only if present in the data.
        max_pairs: If given and the cleaned data has more rows than this,
            sample that many rows.
        seed: Random state used for the sampling.

    Returns:
        A DataFrame restricted to the source, target and (if present) topic
        columns, without rows whose source or target text is missing or
        blank, and with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If ``source`` is a path that does not exist.
        ValueError: If the source or target column is missing.
    """
    if isinstance(source, pd.DataFrame):
        frame = source.copy()
    else:
        dataset_path = Path(source)
        if not dataset_path.exists():
            raise FileNotFoundError(f"Dataset not found: {dataset_path}")
        is_parquet = dataset_path.suffix.lower() == ".parquet"
        reader = pd.read_parquet if is_parquet else pd.read_csv
        frame = reader(dataset_path)

    required = (src_col, tgt_col)
    absent = [name for name in required if name not in frame.columns]
    if absent:
        raise ValueError(
            f"Dataset is missing required column(s) {absent}; found {list(frame.columns)}"
        )

    keep = [src_col, tgt_col]
    if topic_col is not None and topic_col in frame.columns:
        keep.append(topic_col)
    frame = frame[keep]

    # Drop rows where either text is null or only whitespace.
    for name in required:
        texts = frame[name]
        non_blank = texts.astype(str).str.strip() != ""
        frame = frame[texts.notna() & non_blank]

    if max_pairs is not None and len(frame) > max_pairs:
        frame = frame.sample(n=max_pairs, random_state=seed)

    return frame.reset_index(drop=True)
57
+
58
+
59
def has_topics(df: pd.DataFrame, topic_col: str = TOPIC_COL) -> bool:
    """Return whether ``df`` has a usable topic column.

    Args:
        df: The dataset to inspect.
        topic_col: Name of the topic column to look for.

    Returns:
        ``True`` if ``topic_col`` is a column of ``df`` and holds at least one
        non-null value, otherwise ``False``.
    """
    if topic_col not in df.columns:
        return False
    # ``Series.any()`` yields a numpy boolean; coerce it so the annotated
    # ``bool`` return type holds and the result stays JSON-serializable.
    return bool(df[topic_col].notna().any())
61
+
62
+
63
def dataset_hash(df: pd.DataFrame) -> str:
    """Return a SHA-256 hex digest fingerprinting the evaluation pairs.

    Every cell is converted to a string (missing values become ``""``), the
    frame is rendered as index-less CSV with ``\\n`` line endings, and the
    UTF-8 bytes of that text are hashed.
    """
    as_text = df.astype("string").fillna("")
    csv_text = as_text.to_csv(index=False, lineterminator="\n")
    digest = hashlib.sha256()
    digest.update(csv_text.encode("utf-8"))
    return digest.hexdigest()
@@ -0,0 +1,77 @@
1
+ """Embedding extraction and similarity computation.
2
+
3
+ Heavy dependencies (sentence-transformers / torch) are imported lazily so that
4
+ the pure-numpy diagnostics in :mod:`embedprobe.levels` stay usable without them.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import List, Optional, Sequence, Union
10
+
11
+ import numpy as np
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+
14
+
15
def load_model(model: Union[str, object]):
    """Return a ready-to-use model object.

    A string is treated as a model name and passed to ``SentenceTransformer``;
    any other object is assumed to be an already-loaded model and is returned
    unchanged.
    """
    if not isinstance(model, str):
        return model

    # Imported only on this path so the heavy dependency stays optional.
    from sentence_transformers import SentenceTransformer

    return SentenceTransformer(model)
22
+
23
+
24
def encode(
    model,
    texts: Sequence[str],
    batch_size: int = 32,
    device: Optional[str] = None,
    show_progress: bool = False,
) -> np.ndarray:
    """Encode ``texts`` with an already-loaded model.

    Every item is converted with ``str`` first, then the list is passed to
    ``model.encode`` with numpy output requested; that call's result is
    returned as-is.
    """
    sentences = list(map(str, texts))
    options = {
        "batch_size": batch_size,
        "device": device,
        "show_progress_bar": show_progress,
        "convert_to_numpy": True,
    }
    return model.encode(sentences, **options)
39
+
40
+
41
def pairwise_cosine(emb_a: np.ndarray, emb_b: np.ndarray) -> np.ndarray:
    """Return the cosine-similarity matrix of ``emb_a`` rows against ``emb_b`` rows."""
    similarities = cosine_similarity(X=emb_a, Y=emb_b)
    return similarities
44
+
45
+
46
def model_info(model, fallback_name: str = "") -> dict:
    """Collect best-effort architecture metadata for a model.

    Nothing here is allowed to fail: any attribute or call that raises simply
    leaves the corresponding field as ``None``.

    Args:
        model: The loaded model object to introspect.
        fallback_name: Used for ``name`` / ``hub_id`` when the model exposes no
            truthy ``model_name_or_path``.

    Returns:
        A dict with keys ``name``, ``hub_id``, ``embedding_dim``, ``arch``,
        ``layers``, ``vocab_size``, ``params`` and ``license``.
    """
    identifier = getattr(model, "model_name_or_path", None) or fallback_name
    info = dict.fromkeys(
        (
            "name",
            "hub_id",
            "embedding_dim",
            "arch",
            "layers",
            "vocab_size",
            "params",
            "license",
        )
    )
    info["name"] = identifier
    info["hub_id"] = identifier

    try:
        info["embedding_dim"] = int(model.get_sentence_embedding_dimension())
    except Exception:
        # Deliberate best-effort: leave the field as None.
        pass

    try:
        backbone = model._first_module().auto_model
        config = backbone.config
        info["arch"] = getattr(config, "model_type", None)
        # Configs name their depth differently; the first attribute found wins.
        depth_key = next(
            (key for key in ("num_hidden_layers", "num_layers", "n_layer") if hasattr(config, key)),
            None,
        )
        if depth_key is not None:
            info["layers"] = int(getattr(config, depth_key))
        info["vocab_size"] = getattr(config, "vocab_size", None)
        info["license"] = getattr(config, "license", None)
        try:
            info["params"] = int(backbone.num_parameters())
        except Exception:
            # Parameter counting is optional; keep whatever was gathered so far.
            pass
    except Exception:
        # Model without the expected module layout: keep the fields set so far.
        pass
    return info
@@ -0,0 +1,15 @@
1
+ """The four diagnostic levels.
2
+
3
+ Each level is a pure function over a precomputed similarity matrix (and texts /
4
+ topic labels where needed), returning a dict with two keys:
5
+
6
+ - ``"metrics"``: flat, JSON-serializable summary numbers
7
+ - ``"data"``: richer arrays/records used for plotting and drill-down
8
+ """
9
+
10
+ from embedprobe.levels.level0 import signal_to_noise
11
+ from embedprobe.levels.level1 import retrieval_metrics
12
+ from embedprobe.levels.level2 import topic_structure
13
+ from embedprobe.levels.level3 import error_taxonomy
14
+
15
+ __all__ = ["signal_to_noise", "retrieval_metrics", "topic_structure", "error_taxonomy"]
@@ -0,0 +1,72 @@
1
+ """Level 0 — signal-to-noise separability.
2
+
3
+ Compares cosine similarities of true (diagonal) pairs against a matched random
4
+ sample of off-diagonal pairs. A model that has not learned cross-lingual
5
+ alignment shows heavily overlapping distributions and a large (non-significant) KS p-value.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Optional
11
+
12
+ import numpy as np
13
+ from scipy.stats import ks_2samp
14
+
15
+
16
def signal_to_noise(sim_matrix: np.ndarray, seed: Optional[int] = None) -> dict:
    """Measure how well true pairs separate from random pairs.

    The diagonal of ``sim_matrix`` holds the true-pair similarities; an equally
    sized sample of off-diagonal entries serves as the random baseline.

    Args:
        sim_matrix: Square source-by-target similarity matrix.
        seed: Seed for the random off-diagonal sample.

    Returns:
        ``{"metrics": ..., "data": ...}`` where ``metrics`` holds the summary
        numbers (means, standard deviations, SNR, KS statistic and p-value,
        overlap fraction, verdict, pair count) and ``data`` holds the two
        similarity samples as lists.

    Raises:
        ValueError: If the matrix is not 2-D and square, or has fewer than
            two pairs.
    """
    sim_matrix = np.asarray(sim_matrix)
    # Check dimensionality first so a 1-D input gets the documented ValueError
    # rather than an IndexError from ``shape[1]``.
    if sim_matrix.ndim != 2 or sim_matrix.shape[0] != sim_matrix.shape[1]:
        raise ValueError("Level 0 expects a square src-x-tgt similarity matrix")
    n = sim_matrix.shape[0]
    if n < 2:
        raise ValueError("Need at least 2 pairs for signal-to-noise analysis")

    true_sims = np.diag(sim_matrix).astype(float)
    off_diag = sim_matrix[~np.eye(n, dtype=bool)].astype(float)

    rng = np.random.RandomState(seed)
    random_sims = rng.choice(off_diag, size=n, replace=len(off_diag) < n)

    ks = ks_2samp(true_sims, random_sims)
    random_std = float(random_sims.std())
    mean_gap = float(true_sims.mean() - random_sims.mean())
    if random_std > 0:
        snr = mean_gap / random_std
    elif mean_gap > 0:
        snr = float("inf")
    elif mean_gap < 0:
        # A zero-variance baseline that sits *above* the true pairs is the
        # opposite of signal; never report it as +inf.
        snr = float("-inf")
    else:
        # No gap (or an undefined one) and no spread: no measurable separation.
        snr = 0.0
    overlap = _distribution_overlap(true_sims, random_sims)

    metrics = {
        "mean_true": float(true_sims.mean()),
        "std_true": float(true_sims.std()),
        "mean_random": float(random_sims.mean()),
        "std_random": random_std,
        "snr": snr,
        "ks_statistic": float(ks.statistic),
        "ks_p_value": float(ks.pvalue),
        # Same value as "ks_p_value"; both keys are emitted.
        "p_value": float(ks.pvalue),
        "overlap_fraction": overlap,
        "verdict": _verdict(snr, float(ks.pvalue), overlap),
        "n_pairs": int(n),
    }
    data = {
        "true_sims": true_sims.tolist(),
        "random_sims": random_sims.tolist(),
    }
    return {"metrics": metrics, "data": data}
53
+
54
+
55
def _distribution_overlap(a: np.ndarray, b: np.ndarray, bins: int = 50) -> float:
    """Estimate the shared probability mass of two samples.

    Both samples are binned as densities on a common grid spanning their joint
    range; the overlap is the bin-wise minimum density times the bin width,
    summed and clipped to ``[0, 1]``. If every value is identical the samples
    are indistinguishable and the overlap is ``1.0``.
    """
    lower = float(min(a.min(), b.min()))
    upper = float(max(a.max(), b.max()))
    if lower == upper:
        return 1.0

    density_a, bin_edges = np.histogram(a, bins=bins, range=(lower, upper), density=True)
    density_b, _ = np.histogram(b, bins=bin_edges, density=True)
    shared_mass = np.sum(np.minimum(density_a, density_b) * np.diff(bin_edges))
    return float(np.clip(shared_mass, 0.0, 1.0))
65
+
66
+
67
def _verdict(snr: float, p_value: float, overlap: float) -> str:
    """Map the Level 0 numbers onto a three-way textual verdict."""
    is_strong = p_value < 0.01 and snr >= 2.0 and overlap <= 0.25
    if is_strong:
        return "strong signal"
    is_weak = p_value < 0.05 and snr > 0.5
    return "weak signal" if is_weak else "poor separation"
@@ -0,0 +1,58 @@
1
+ """Level 1 — retrieval performance.
2
+
3
+ For each source sentence, rank all target sentences by cosine similarity and
4
+ locate the true counterpart. Ranks are computed on the full similarity matrix,
5
+ so Recall@k, MRR and the CMC curve are exact for any k.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Sequence
11
+
12
+ import numpy as np
13
+
14
+ DEFAULT_KS = (1, 5, 10)
15
+
16
+
17
def true_ranks(sim_matrix: np.ndarray) -> np.ndarray:
    """Return the 1-based rank of the true (diagonal) target for every row.

    The rank is one more than the number of candidates in the row that score
    strictly higher than the true target, so ties resolve in its favour.
    """
    sims = np.asarray(sim_matrix)
    index = np.arange(sims.shape[0])
    true_scores = sims[index, index]
    beaten_by = np.sum(sims > true_scores.reshape(-1, 1), axis=1)
    return beaten_by + 1
24
+
25
+
26
def retrieval_metrics(
    sim_matrix: np.ndarray,
    ks: Sequence[int] = DEFAULT_KS,
    cmc_max_rank: int = 10,
) -> dict:
    """Compute retrieval metrics from a square similarity matrix.

    Args:
        sim_matrix: Square source-by-target similarity matrix; the true target
            of row ``i`` is column ``i``.
        ks: Cut-offs for which ``recall@k`` and ``precision@k`` are reported.
        cmc_max_rank: Largest rank included in the CMC curve.

    Returns:
        ``{"metrics": ..., "data": ...}`` where ``metrics`` holds MRR, the
        query count and the per-``k`` recall/precision, and ``data`` holds the
        per-query ranks, the CMC curve and a histogram of all similarities.

    Raises:
        ValueError: If the matrix is not square.
    """
    sim_matrix = np.asarray(sim_matrix)
    if sim_matrix.shape[0] != sim_matrix.shape[1]:
        raise ValueError("Level 1 expects a square src-x-tgt similarity matrix")

    ranks = true_ranks(sim_matrix)

    def hit_rate(cutoff: int) -> float:
        # Share of queries whose true target is ranked within ``cutoff``.
        return float((ranks <= cutoff).mean())

    metrics = {"mrr": float(np.mean(1.0 / ranks)), "n_queries": int(len(ranks))}
    for k in ks:
        hits = hit_rate(k)
        # With exactly one relevant target per query, precision is recall / k.
        metrics.update({f"recall@{k}": hits, f"precision@{k}": hits / k})

    cmc_ranks = list(range(1, cmc_max_rank + 1))
    data = {
        "ranks": ranks.tolist(),
        "cmc_ranks": cmc_ranks,
        "cmc_values": [hit_rate(rank) for rank in cmc_ranks],
        "similarity_histogram": _histogram(sim_matrix),
    }
    return {"metrics": metrics, "data": data}
54
+
55
+
56
def _histogram(sim_matrix: np.ndarray, bins: int = 20) -> dict:
    """Histogram every entry of the matrix into plain-list counts and bin edges."""
    bin_counts, bin_edges = np.histogram(sim_matrix.ravel(), bins=bins)
    return dict(counts=bin_counts.tolist(), bin_edges=bin_edges.tolist())