docdistance 1.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ from importlib.metadata import PackageNotFoundError, version
2
+
3
+ from docdistance import config # noqa: F401 (sets up logging + paths on import)
4
+ from docdistance.distance import (
5
+ DistanceResult,
6
+ SourceConditionedResult,
7
+ closeness,
8
+ compute_distance,
9
+ compute_source_conditioned,
10
+ rwmd,
11
+ smd,
12
+ wcd,
13
+ )
14
+ from docdistance.pipeline import (
15
+ DocDistance,
16
+ document_distance,
17
+ source_conditioned_distance,
18
+ )
19
+
20
+ try:
21
+ __version__ = version("docdistance")
22
+ except PackageNotFoundError: # running from source, not installed
23
+ __version__ = "0.0.0"
24
+
25
+ __all__ = [
26
+ "DocDistance",
27
+ "DistanceResult",
28
+ "SourceConditionedResult",
29
+ "document_distance",
30
+ "source_conditioned_distance",
31
+ "compute_distance",
32
+ "compute_source_conditioned",
33
+ "smd",
34
+ "wcd",
35
+ "rwmd",
36
+ "closeness",
37
+ "__version__",
38
+ ]
docdistance/cli.py ADDED
@@ -0,0 +1,229 @@
1
+ """docdistance command-line interface.
2
+
3
+ Three subcommands - ``install`` (the only one that downloads models), ``distance`` (symmetric SMD)
4
+ and ``distance-wrt-source`` (source-conditioned). Human output is rich and coloured on a capable
5
+ terminal; ``--json`` emits machine-readable JSON and ``--result-only`` emits the bare result.
6
+ Logs go to stderr (loguru, ``--verbose`` for DEBUG), so stdout carries only the result.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from enum import Enum
12
+ import json
13
+
14
+ from rich.console import Console
15
+ from rich.panel import Panel
16
+ from rich.table import Table
17
+ import typer
18
+
19
+ from docdistance.config import configure_logging
20
+ from docdistance.distance import DEFAULT_THRESHOLD
21
+
22
+ app = typer.Typer(
23
+ rich_markup_mode="rich",
24
+ no_args_is_help=True,
25
+ add_completion=False,
26
+ help="[bold]docdistance[/bold] - semantic distance between documents via Statement Mover's Distance "
27
+ "(optimal transport over mmBERT statement embeddings).",
28
+ )
29
+
30
+ _out = Console() # stdout, for the result
31
+ _err = Console(stderr=True) # stderr, for errors
32
+
33
+
34
class Backend(str, Enum):
    """Statement-encoder backends accepted by the scoring commands.

    Members subclass ``str``, so each one compares equal to its plain string value.
    """

    openvino = "openvino"
    torch = "torch"
37
+
38
+
39
class InstallBackend(str, Enum):
    """Choices for the ``install`` command: a single encoder backend, or ``both``.

    Members subclass ``str``, so each one compares equal to its plain string value.
    """

    openvino = "openvino"
    torch = "torch"
    both = "both"
43
+
44
+
45
def _version_cb(value: bool):
    """Callback for ``--version``: when the flag is set, print the version and exit.

    Does nothing when ``value`` is falsy. The package import is deferred to the call so the
    version is only looked up when it is actually requested.
    """
    if not value:
        return
    from docdistance import __version__

    typer.echo(f"docdistance {__version__}")
    raise typer.Exit()
51
+
52
+
53
@app.callback()
def main(
    # ``--version`` is handled entirely by the eager ``_version_cb`` callback, so this
    # function has no body beyond its docstring.
    version: bool = typer.Option(
        False,
        "--version",
        callback=_version_cb,
        is_eager=True,
        help="show version and exit",
    ),
):
    """Semantic document distance grounded in optimal-transport theory."""
60
+
61
+
62
def _run(fn):
    """Invoke ``fn()`` and return its result, converting a missing-model failure into a CLI exit.

    A ``ModelsNotInstalled`` raised by ``fn`` is printed to the stderr console as a red
    ``error:`` line and replaced by ``typer.Exit(1)``. Any other exception propagates untouched.
    """
    from docdistance.encoders import ModelsNotInstalled

    try:
        outcome = fn()
    except ModelsNotInstalled as exc:
        _err.print(f"[bold red]error:[/bold red] {exc}")
        raise typer.Exit(1)
    else:
        return outcome
71
+
72
+
73
def _emit_distance(r, json_out: bool, result_only: bool) -> None:
    """Write a symmetric-distance result to stdout in the requested format.

    ``result_only`` takes precedence and prints just ``r.smd``. Otherwise ``json_out`` prints
    ``r.to_dict()`` as indented JSON. With neither flag a rich panel is rendered, bordered green
    when ``r.verdict`` is ``"similar"`` and red for anything else.
    """
    if result_only:
        typer.echo(str(r.smd))
        return
    if json_out:
        typer.echo(json.dumps(r.to_dict(), indent=2))
        return

    color = "red" if r.verdict != "similar" else "green"
    # (label, rendered value) pairs, in display order
    rows = [
        ("SMD (distance)", f"{r.smd:.4f}"),
        ("closeness", f"{r.closeness * 100:.1f}%"),
        ("verdict", f"[{color}]{r.verdict}[/{color}] (threshold {r.threshold:.2f} closeness)"),
        ("bounds", f"WCD {r.wcd:.4f} ≤ RWMD {r.rwmd:.4f} ≤ SMD {r.smd:.4f}"),
        ("statements", f"{r.n_statements_a} vs {r.n_statements_b}"),
        ("anisotropy", "on" if r.anisotropy else "off"),
    ]

    grid = Table.grid(padding=(0, 2))
    grid.add_column(style="bold cyan")
    grid.add_column()
    for label, rendered in rows:
        grid.add_row(label, rendered)

    panel = Panel(grid, title="[bold]Document distance[/bold]", border_style=color, expand=False)
    _out.print(panel)
95
+
96
+
97
def _emit_wrt_source(r, json_out: bool, result_only: bool) -> None:
    """Write a source-conditioned result to stdout in the requested format.

    ``result_only`` takes precedence and prints ``d_sel,residual_a,residual_b`` on one line.
    Otherwise ``json_out`` prints ``r.to_dict()`` as indented JSON. With neither flag a rich
    panel is rendered, followed by a dimmed footnote about what the residual means.
    """
    if result_only:
        typer.echo(f"{r.d_sel},{r.residual_a},{r.residual_b}")
        return
    if json_out:
        typer.echo(json.dumps(r.to_dict(), indent=2))
        return

    # (label, rendered value) pairs, in display order
    rows = [
        ("D_sel (selection divergence)", f"{r.d_sel:.4f}"),
        ("A → source", f"{r.residual_a:.4f} (closeness {r.closeness_a * 100:.1f}%)"),
        ("B → source", f"{r.residual_b:.4f} (closeness {r.closeness_b * 100:.1f}%)"),
        (
            "statements",
            f"A {r.n_statements_a} / B {r.n_statements_b} / S {r.n_statements_source}",
        ),
    ]

    grid = Table.grid(padding=(0, 2))
    grid.add_column(style="bold cyan")
    grid.add_column()
    for label, rendered in rows:
        grid.add_row(label, rendered)

    panel = Panel(
        grid,
        title="[bold]Source-conditioned distance d(A,B|S)[/bold]",
        border_style="cyan",
        expand=False,
    )
    _out.print(panel)
    _out.print(
        "[dim]residual = geometric distance to the source; the reranker + NLI grounding grade and "
        "numeric verifier are deferred to E02[/dim]"
    )
125
+
126
+
127
@app.command(
    epilog="[bold]Examples[/bold]\n\n"
    " docdistance distance report_v1.md report_v2.md\n"
    ' docdistance distance "first text" "second text" --backend torch\n'
    " docdistance distance a.md b.md --json\n"
    " docdistance distance a.md b.md --result-only"
)
def distance(
    a: str = typer.Argument(..., help="first document - a file path or raw text"),
    b: str = typer.Argument(..., help="second document - a file path or raw text"),
    backend: Backend = typer.Option(
        Backend.openvino, "--backend", help="statement encoder backend"
    ),
    anisotropy: bool = typer.Option(
        False,
        "--anisotropy/--no-anisotropy",
        help="all-but-the-top anisotropy removal - needs a corpus, off by default for a pair",
    ),
    threshold: float = typer.Option(
        DEFAULT_THRESHOLD,
        "--threshold",
        help="closeness cutoff for the similar / not-similar verdict",
    ),
    json_out: bool = typer.Option(False, "--json", help="machine-readable JSON to stdout"),
    result_only: bool = typer.Option(
        False, "--result-only", help="bare SMD scalar to stdout, no clutter"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="DEBUG logging to stderr"),
):
    """Symmetric distance between two documents - the exact Statement Mover's Distance."""
    # Logging is configured before the pipeline import so anything that import logs already
    # honours ``--verbose``.
    configure_logging(verbose)
    from docdistance.pipeline import document_distance

    def _score():
        # Deferred so ``_run`` can translate a missing-model error into a clean exit.
        return document_distance(
            a, b, backend=backend.value, anisotropy=anisotropy, threshold=threshold
        )

    _emit_distance(_run(_score), json_out, result_only)
166
+
167
+
168
@app.command(
    name="distance-wrt-source",
    epilog="[bold]Examples[/bold]\n\n"
    " docdistance distance-wrt-source summary_a.md summary_b.md --source article.md\n"
    " docdistance distance-wrt-source a.md b.md -s s.md --json\n"
    " docdistance distance-wrt-source a.md b.md -s s.md --result-only [dim]# D_sel,res_a,res_b[/dim]",
)
def distance_wrt_source(
    a: str = typer.Argument(..., help="first document - a file path or raw text"),
    b: str = typer.Argument(..., help="second document - a file path or raw text"),
    source: str = typer.Option(..., "--source", "-s", help="the common source document"),
    backend: Backend = typer.Option(
        Backend.openvino, "--backend", help="statement encoder backend"
    ),
    anisotropy: bool = typer.Option(
        False,
        "--anisotropy/--no-anisotropy",
        help="anisotropy removal - needs a corpus, off by default",
    ),
    json_out: bool = typer.Option(False, "--json", help="machine-readable JSON to stdout"),
    result_only: bool = typer.Option(
        False, "--result-only", help="bare comma-separated D_sel,residual_a,residual_b to stdout"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="DEBUG logging to stderr"),
):
    """Source-conditioned distance d(A, B | S) - selection divergence plus each document's distance to S."""
    # Logging is configured before the pipeline import so anything that import logs already
    # honours ``--verbose``.
    configure_logging(verbose)
    from docdistance.pipeline import source_conditioned_distance

    def _score():
        # Deferred so ``_run`` can translate a missing-model error into a clean exit.
        return source_conditioned_distance(
            a, b, source, backend=backend.value, anisotropy=anisotropy
        )

    _emit_wrt_source(_run(_score), json_out, result_only)
203
+
204
+
205
@app.command(
    epilog="[bold]Examples[/bold]\n\n"
    " docdistance install [dim]# both backends[/dim]\n"
    " docdistance install --backend openvino",
)
def install(
    backend: InstallBackend = typer.Option(
        InstallBackend.both, "--backend", help="which encoder weights to fetch"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="DEBUG logging to stderr"),
):
    """Download and cache the models - the only command that fetches from the Hub (TQDM progress bars)."""
    configure_logging(verbose)
    from docdistance.encoders import download_models

    def _fetch():
        return download_models(backend.value)

    # ``_run`` prints the red ``error:`` line and exits with status 1 on ModelsNotInstalled.
    ready = _run(_fetch)
    _out.print(f"[green]models ready:[/green] {', '.join(ready)}")
226
+
227
+
228
+ if __name__ == "__main__":
229
+ app()
docdistance/config.py ADDED
@@ -0,0 +1,66 @@
1
+ from pathlib import Path
2
+ import sys
3
+
4
+ from dotenv import load_dotenv
5
+ from loguru import logger
6
+
7
+ ########### SETUP ###############
8
+
9
+ # set up logger - INFO by default (DEBUG only via the CLI --verbose flag), sink to stderr so
10
+ # stdout stays clean for --json / --result-only output
11
+ logger.remove()
12
+ logger.add(sys.stderr, colorize=True, level="INFO")
13
+
14
+ # If tqdm is installed, configure loguru with tqdm.write
15
+ # https://github.com/Delgan/loguru/issues/135
16
+ try:
17
+ from tqdm import tqdm
18
+
19
+ logger.remove()
20
+ logger.add(lambda msg: tqdm.write(msg, end="", file=sys.stderr), colorize=True, level="INFO")
21
+ except ModuleNotFoundError:
22
+ pass
23
+
24
+ ########## VARIABLES ############
25
+
26
+ # Load environment variables from .env file if it exists
27
+ load_dotenv()
28
+
29
+ # paths
30
+ PROJ_ROOT = Path(__file__).resolve().parents[2]
31
+ DATA_DIR = PROJ_ROOT / "data"
32
+ RAW_DATA_DIR = DATA_DIR / "raw"
33
+ INTERIM_DATA_DIR = DATA_DIR / "interim"
34
+ PROCESSED_DATA_DIR = DATA_DIR / "processed"
35
+ EXTERNAL_DATA_DIR = DATA_DIR / "external"
36
+ MODELS_DIR = PROJ_ROOT / "models"
37
+ REPORTS_DIR = PROJ_ROOT / "reports"
38
+ FIGURES_DIR = REPORTS_DIR / "figures"
39
+
40
+ # log current root dir (debug so it never pollutes machine-readable stdout)
41
+ logger.debug(f"PROJ_ROOT path is: {PROJ_ROOT}")
42
+
43
+ ########## MODELS ###############
44
+
45
+ # segmenter (wtpsplit SaT) and the mmBERT statement encoders, by backend
46
+ SAT_MODEL = "sat-3l-sm"
47
+ MMBERT_TORCH_MODEL = "jhu-clsp/mmBERT-base"
48
+ MMBERT_OPENVINO_LOCAL = MODELS_DIR / "02-mmbert-openvino-int8"
49
+ MMBERT_OPENVINO_HF = "stellars/mmBERT-base-openvino-int8"
50
+
51
+
52
def configure_logging(verbose: bool = False) -> None:
    """Replace every loguru sink with a single stderr sink at INFO, or DEBUG when ``verbose``.

    When tqdm is importable the sink writes through ``tqdm.write`` (targeting stderr); otherwise
    it writes to ``sys.stderr`` directly. Nothing is ever sent to stdout, which stays reserved
    for ``--json`` and ``--result-only`` output.
    """
    logger.remove()
    level = "INFO" if not verbose else "DEBUG"
    try:
        from tqdm import tqdm
    except ModuleNotFoundError:
        # no tqdm available: plain stderr sink
        logger.add(sys.stderr, colorize=True, level=level)
        return
    logger.add(lambda msg: tqdm.write(msg, end="", file=sys.stderr), colorize=True, level=level)
docdistance/dataset.py ADDED
@@ -0,0 +1,29 @@
1
+ from pathlib import Path
2
+
3
+ from loguru import logger
4
+ from tqdm import tqdm
5
+ import typer
6
+
7
+ from docdistance.config import PROCESSED_DATA_DIR, RAW_DATA_DIR
8
+
9
+ app = typer.Typer()
10
+
11
+
12
@app.command()
def main(
    # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ----
    input_path: Path = RAW_DATA_DIR / "dataset.csv",
    output_path: Path = PROCESSED_DATA_DIR / "dataset.csv",
    # ----------------------------------------------
):
    # Template placeholder: neither path is read or written; the loop only demonstrates
    # logging alongside a tqdm progress bar.
    # ---- REPLACE THIS WITH YOUR OWN CODE ----
    logger.info("Processing dataset...")
    for step in tqdm(range(10), total=10):
        if step != 5:
            continue
        logger.info("Something happened for iteration 5.")
    logger.success("Processing dataset complete.")
    # -----------------------------------------
26
+
27
+
28
+ if __name__ == "__main__":
29
+ app()
@@ -0,0 +1,231 @@
1
+ """Pure-numpy optimal-transport core for document distance.
2
+
3
+ No heavy ML dependencies - only numpy and POT (``ot``). Segmentation and embedding live in
4
+ ``encoders.py``; this module operates on statement-embedding arrays: L2-normalized float32 of
5
+ shape ``[n_statements, dim]``. Every function here is deterministic and CPU-only, which is why
6
+ the unit tests can exercise it without loading a single model.
7
+
8
+ The distance is the exact Statement Mover's Distance (SMD) - optimal transport between two
9
+ statement clouds with the metric ground cost ``sqrt(2 - 2cos)`` (Euclidean on L2-normalized
10
+ vectors). ``wcd`` and ``rwmd`` are the cheap lower bounds (``WCD <= RWMD <= SMD``). The
11
+ source-conditioned helpers re-base the transport onto a common source ``S``.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from dataclasses import asdict, dataclass
17
+
18
+ import numpy as np
19
+ import ot
20
+
21
+ # orthogonal statement clouds -> closeness 0; cos >= 0 for these embeddings so distance lands in [0, sqrt(2)]
22
+ SMD_MAX = float(np.sqrt(2.0))
23
+
24
+ # closeness cutoff for the similar / not-similar verdict; heuristic, calibrate per corpus
25
+ # (measured boundary on the ibm-ai-adoption fixtures: min gold 72.7% vs max adversarial 72.2%)
26
+ DEFAULT_THRESHOLD = 0.725
27
+
28
+
29
def cost_matrix(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """Pairwise Euclidean ground cost between the rows of ``X`` and the rows of ``Y``.

    On L2-normalized rows this equals ``sqrt(2 - 2cos)``, which is a metric.
    """
    pairwise = ot.dist(X, Y, metric="euclidean")
    return pairwise
32
+
33
+
34
+ def _uniform(n: int) -> np.ndarray:
35
+ return np.full(n, 1.0 / n)
36
+
37
+
38
def _ab(X: np.ndarray, Y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Return uniform weight vectors for ``X`` and ``Y``, one entry per row of each."""
    weights_x = _uniform(len(X))
    weights_y = _uniform(len(Y))
    return weights_x, weights_y
40
+
41
+
42
def smd(X: np.ndarray, Y: np.ndarray) -> float:
    """Exact Statement Mover's Distance between two statement clouds.

    Solves the optimal-transport problem with uniform weights on both sides and the Euclidean
    ground cost from :func:`cost_matrix`, returning the transport cost as a plain ``float``.
    """
    weights_x, weights_y = _ab(X, Y)
    ground = cost_matrix(X, Y)
    return float(ot.emd2(weights_x, weights_y, ground))
45
+
46
+
47
def wcd(X: np.ndarray, Y: np.ndarray) -> float:
    """Lower bound on SMD: Euclidean distance between the mean rows (centroids) of ``X`` and ``Y``."""
    centroid_x = X.mean(0)
    centroid_y = Y.mean(0)
    gap = centroid_x - centroid_y
    return float(np.linalg.norm(gap))
50
+
51
+
52
def rwmd(X: np.ndarray, Y: np.ndarray) -> float:
    """Lower bound on SMD: the larger of the two one-sided nearest-statement relaxations.

    Each side sends every statement to its cheapest counterpart on the other side, weighted
    uniformly; the tighter (larger) of the two averaged costs is returned.
    """
    weights_x, weights_y = _ab(X, Y)
    C = cost_matrix(X, Y)
    x_to_y = (weights_x * C.min(1)).sum()
    y_to_x = (weights_y * C.min(0)).sum()
    return float(max(x_to_y, y_to_x))
57
+
58
+
59
def closeness(d: float) -> float:
    """Convert a distance into a similarity score: ``1 - d / SMD_MAX``, floored at ``0.0``.

    A distance of 0 maps to 1.0; any distance at or beyond ``SMD_MAX`` maps to 0.0.
    """
    similarity = 1.0 - d / SMD_MAX
    return max(0.0, similarity)
62
+
63
+
64
def verdict(close: float, threshold: float = DEFAULT_THRESHOLD) -> str:
    """Return ``"similar"`` when ``close`` reaches ``threshold``, else ``"not similar"``."""
    if close >= threshold:
        return "similar"
    return "not similar"
66
+
67
+
68
def all_but_the_top(emb: dict[str, np.ndarray], k: int = 1) -> dict[str, np.ndarray]:
    """All-but-the-top anisotropy removal (Mu & Viswanath, ICLR 2018).

    The statement rows of every document in ``emb`` are pooled, the pooled mean is subtracted,
    the top-``k`` right singular directions of the centered pool are projected out, and each
    row is re-normalized to unit length (with a small epsilon guarding zero rows). The pooled
    rows are then handed back per document, under the same keys and in the same order, as
    float32 arrays.
    """
    names = list(emb)
    stacked = np.concatenate([emb[name] for name in names], 0)
    centered = stacked - stacked.mean(0)

    # rows of Vt are the principal directions of the centered pool
    top = np.linalg.svd(centered, full_matrices=False)[2][:k]
    cleaned = centered - centered @ top.T @ top
    cleaned = cleaned / (np.linalg.norm(cleaned, axis=1, keepdims=True) + 1e-9)

    # cut the pooled matrix back into per-document pieces at the cumulative row counts
    boundaries = np.cumsum([len(emb[name]) for name in names])[:-1]
    pieces = np.split(cleaned, boundaries)
    return {name: piece.astype(np.float32) for name, piece in zip(names, pieces)}
90
+
91
+
92
+ # --- source-conditioned core (metric parts only; reranker/NLI grounding deferred to E02) ---
93
+
94
+
95
+ COVERAGE_TEMPERATURE = 0.1
96
+
97
+
98
def coverage_profile(
    X: np.ndarray, S: np.ndarray, temperature: float = COVERAGE_TEMPERATURE
) -> np.ndarray:
    """Soft histogram of which source statements ``S`` the statements of ``X`` land on.

    Every row of ``X`` is softly assigned to the source rows with ``softmax(-cost / temperature)``;
    the profile is the mean of those assignments over ``X``, so it has one entry per source
    statement and sums to 1. Unlike a balanced-OT column marginal, which the transport
    constraint forces to be uniform, this histogram differs from document to document.
    """
    C = cost_matrix(X, S)
    # shifting each row by its minimum leaves the softmax unchanged but keeps exp() well-scaled
    shifted = C - C.min(1, keepdims=True)
    weights = np.exp(-shifted / temperature)
    weights = weights / weights.sum(1, keepdims=True)
    return weights.mean(0)
114
+
115
+
116
def selection_divergence(cov_a: np.ndarray, cov_b: np.ndarray, S: np.ndarray) -> float:
    """D_sel: optimal-transport cost between two coverage profiles over the source statements.

    The two profiles are the transported masses and the ground cost is the pairwise Euclidean
    distance among the rows of ``S`` itself, so the comparison is made on a support both
    documents share.
    """
    mass_a = np.asarray(cov_a)
    mass_b = np.asarray(cov_b)
    ground = cost_matrix(S, S)
    return float(ot.emd2(mass_a, mass_b, ground))
123
+
124
+
125
def ungrounded_residual(X: np.ndarray, S: np.ndarray) -> float:
    """Geometric grounding proxy for one document: simply ``smd(X, S)``.

    A larger value means the document's statements sit further from the source statements.
    This is the distance-to-source only; the reranker + NLI grading is deferred to E02.
    """
    distance_to_source = smd(X, S)
    return distance_to_source
133
+
134
+
135
@dataclass
class DistanceResult:
    """Outcome of a symmetric comparison; ``smd`` is the distance, ``wcd``/``rwmd`` its lower bounds."""

    # distances
    smd: float
    wcd: float
    rwmd: float
    # similarity score and the verdict taken against ``threshold``
    closeness: float
    threshold: float
    verdict: str
    # how the result was produced
    anisotropy: bool
    n_statements_a: int
    n_statements_b: int

    def to_dict(self) -> dict:
        """Return every field as a plain ``dict`` keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
151
+
152
+
153
@dataclass
class SourceConditionedResult:
    """Outcome of a source-conditioned comparison of documents A and B against a source."""

    # selection divergence between the two coverage profiles
    d_sel: float
    # each document's distance to the source, and the closeness derived from it
    residual_a: float
    residual_b: float
    closeness_a: float
    closeness_b: float
    # statement counts
    n_statements_a: int
    n_statements_b: int
    n_statements_source: int
    # coverage profiles over the source statements, as plain lists
    coverage_a: list[float]
    coverage_b: list[float]

    def to_dict(self) -> dict:
        """Return every field as a plain ``dict`` keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
170
+
171
+
172
def compute_distance(
    emb_a: np.ndarray,
    emb_b: np.ndarray,
    *,
    anisotropy: bool = False,
    threshold: float = DEFAULT_THRESHOLD,
) -> DistanceResult:
    """Build a :class:`DistanceResult` for two statement-embedding arrays.

    Statement counts are taken from the inputs as given. With ``anisotropy`` enabled both arrays
    are first passed together through :func:`all_but_the_top` (``k=1``) and every distance is
    computed on the cleaned embeddings. It is off by default: the shared direction is meant to
    be estimated from a corpus, so over a single pair it strips genuine shared meaning and
    distorts the scale.
    """
    count_a = len(emb_a)
    count_b = len(emb_b)
    if anisotropy:
        cleaned = all_but_the_top({"a": emb_a, "b": emb_b}, k=1)
        emb_a = cleaned["a"]
        emb_b = cleaned["b"]

    distance = smd(emb_a, emb_b)
    score = closeness(distance)
    centroid_bound = wcd(emb_a, emb_b)
    relaxed_bound = rwmd(emb_a, emb_b)
    return DistanceResult(
        smd=distance,
        wcd=centroid_bound,
        rwmd=relaxed_bound,
        closeness=score,
        threshold=threshold,
        verdict=verdict(score, threshold),
        anisotropy=anisotropy,
        n_statements_a=count_a,
        n_statements_b=count_b,
    )
202
+
203
+
204
def compute_source_conditioned(
    emb_a: np.ndarray,
    emb_b: np.ndarray,
    emb_source: np.ndarray,
    *,
    anisotropy: bool = False,
) -> SourceConditionedResult:
    """Build a :class:`SourceConditionedResult` from the A, B and source statement embeddings.

    Statement counts are taken from the inputs as given. With ``anisotropy`` enabled all three
    arrays are first passed together through :func:`all_but_the_top` (``k=1``). Coverage
    profiles and residuals are then computed per document against the source, and D_sel is the
    transport cost between the two coverage profiles.
    """
    count_a = len(emb_a)
    count_b = len(emb_b)
    count_s = len(emb_source)
    if anisotropy:
        cleaned = all_but_the_top({"a": emb_a, "b": emb_b, "s": emb_source}, k=1)
        emb_a = cleaned["a"]
        emb_b = cleaned["b"]
        emb_source = cleaned["s"]

    profile_a = coverage_profile(emb_a, emb_source)
    profile_b = coverage_profile(emb_b, emb_source)
    drift_a = ungrounded_residual(emb_a, emb_source)
    drift_b = ungrounded_residual(emb_b, emb_source)
    return SourceConditionedResult(
        d_sel=selection_divergence(profile_a, profile_b, emb_source),
        residual_a=drift_a,
        residual_b=drift_b,
        closeness_a=closeness(drift_a),
        closeness_b=closeness(drift_b),
        n_statements_a=count_a,
        n_statements_b=count_b,
        n_statements_source=count_s,
        coverage_a=profile_a.tolist(),
        coverage_b=profile_b.tolist(),
    )
@@ -0,0 +1,229 @@
1
+ """Model-backed segmentation and embedding.
2
+
3
+ The heavy dependencies (torch, transformers, openvino, wtpsplit, huggingface_hub) are imported
4
+ lazily inside the functions, so the pure-numpy :mod:`docdistance.distance` core stays
5
+ importable - and unit-testable - without them.
6
+
7
+ Inference never downloads. The constructors set ``HF_HUB_OFFLINE=1``; a model missing from the
8
+ cache raises :class:`ModelsNotInstalled` pointing at ``docdistance install``. Downloading happens
9
+ only in :func:`download_models`, which the ``install`` CLI command calls (TQDM progress bars come
10
+ from huggingface_hub).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import contextlib
16
+ import io
17
+ import os
18
+ from pathlib import Path
19
+
20
+ from loguru import logger
21
+ import numpy as np
22
+
23
+ from docdistance import config
24
+
25
+ # keep transformers quiet before it is ever imported - it otherwise prints a model LOAD REPORT
26
+ # and advisory warnings (e.g. dropped LM-head keys) that leak past stderr redirection
27
+ os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
28
+ os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")
29
+
30
+ EMBED_BATCH = 64
31
+ MAX_TOKENS = 128
32
+
33
+ _INSTALL_HINT = "model not found in cache - run: docdistance install"
34
+ _EXTRA_HINT = "model dependencies missing - reinstall: pip install --force-reinstall docdistance"
35
+
36
+
37
class ModelsNotInstalled(RuntimeError):
    """Raised when a model or a model dependency is unavailable.

    The message carries the remedy for the user (for example ``docdistance install``).
    """
39
+
40
+
41
def _require_models_extra() -> None:
    """Check that ``transformers`` and ``wtpsplit`` import, then quieten transformers' logging.

    Raises:
        ModelsNotInstalled: if either package is missing (carries the reinstall hint).
    """
    import logging as _logging

    try:
        import transformers
        import wtpsplit  # noqa: F401
    except ModuleNotFoundError as exc:
        raise ModelsNotInstalled(_EXTRA_HINT) from exc

    # belt-and-suspenders alongside the env vars: silence the LOAD REPORT / modeling logger
    transformers.logging.set_verbosity_error()
    modeling_log = _logging.getLogger("transformers.modeling_utils")
    modeling_log.setLevel(_logging.ERROR)
52
+
53
+
54
+ def _set_hf_token() -> None:
55
+ """Map the project's vault token (HF_AUTH_TOKEN, loaded from .env by config) to HF_TOKEN."""
56
+ if os.environ.get("HF_AUTH_TOKEN") and not os.environ.get("HF_TOKEN"):
57
+ os.environ["HF_TOKEN"] = os.environ["HF_AUTH_TOKEN"]
58
+ os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")
59
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
60
+
61
+
62
class Segmenter:
    """SAT statement segmenter (wtpsplit ``sat-3l-sm``), CPU.

    Construction loads the model named by ``config.SAT_MODEL``; :meth:`split` turns a text into
    a list of non-empty, whitespace-stripped statements.
    """

    def __init__(self, offline: bool = True):
        """Load the segmenter.

        Args:
            offline: when true, default ``HF_HUB_OFFLINE`` to ``"1"`` so loading cannot reach
                the Hub. An ``HF_HUB_OFFLINE`` value already in the environment is left alone.

        Raises:
            ModelsNotInstalled: if the model dependencies are missing or the model fails to load.
        """
        # The offline flag has to be exported before transformers / huggingface_hub are first
        # imported (which `_require_models_extra` does): those libraries read it at import
        # time, so setting it afterwards is too late to take effect.
        if offline:
            os.environ.setdefault("HF_HUB_OFFLINE", "1")
        _require_models_extra()
        _set_hf_token()
        # stderr is swallowed while the model loads to hide the library's own chatter
        with contextlib.redirect_stderr(io.StringIO()):
            from wtpsplit import SaT

            try:
                self._sat = SaT(config.SAT_MODEL)
            except Exception as exc:  # missing weights under offline mode
                raise ModelsNotInstalled(_INSTALL_HINT) from exc
        logger.debug("loaded SAT segmenter '{}'", config.SAT_MODEL)

    def split(self, text: str) -> list[str]:
        """Split ``text`` into statements, stripping whitespace and dropping empty pieces."""
        with contextlib.redirect_stderr(io.StringIO()):
            return [s.strip() for s in self._sat.split(text) if s.strip()]
82
+
83
+
84
class OpenVINOEncoder:
    """mmBERT INT8 OpenVINO encoder (CPU). Mean-pooled, L2-normalized statement embeddings.

    The model is read from ``config.MMBERT_OPENVINO_LOCAL`` when that directory contains
    ``openvino_model.xml``; otherwise it is resolved through ``snapshot_download`` of
    ``config.MMBERT_OPENVINO_HF``.
    """

    name = "openvino"

    def __init__(self, offline: bool = True):
        """Load and compile the model and its tokenizer.

        Args:
            offline: when true and the local model directory is absent, default
                ``HF_HUB_OFFLINE`` to ``"1"`` so the snapshot lookup is served from the cache.

        Raises:
            ModelsNotInstalled: if the model dependencies are missing or the snapshot cannot
                be resolved.
        """
        src = config.MMBERT_OPENVINO_LOCAL
        needs_hub = not (src / "openvino_model.xml").exists()
        # The offline flag has to be exported before transformers / huggingface_hub are first
        # imported (which `_require_models_extra` does): those libraries read it at import
        # time, so setting it afterwards is too late to take effect.
        if needs_hub and offline:
            os.environ.setdefault("HF_HUB_OFFLINE", "1")
        _require_models_extra()
        _set_hf_token()
        import openvino as ov
        from transformers import AutoTokenizer

        if needs_hub:
            from huggingface_hub import snapshot_download

            try:
                src = Path(snapshot_download(config.MMBERT_OPENVINO_HF))
            except Exception as exc:
                raise ModelsNotInstalled(_INSTALL_HINT) from exc
        core = ov.Core()
        model = core.read_model(str(src / "openvino_model.xml"))
        # 2nd input name is dropped to '74' (attention_mask) during conversion - feed positionally
        self._innames = [i.get_any_name() for i in model.inputs]
        self._cm = core.compile_model(model, "CPU", {"PERFORMANCE_HINT": "LATENCY"})
        self._tok = AutoTokenizer.from_pretrained(str(src))
        logger.debug("loaded OpenVINO INT8 encoder from {}", src)

    def encode(self, sents: list[str]) -> np.ndarray:
        """Embed ``sents`` in batches; returns one float32 row per statement.

        Each row is the attention-masked mean of the model's first output over the token axis,
        L2-normalized.

        Raises:
            ValueError: if ``sents`` is empty (there is nothing to concatenate).
        """
        if len(sents) == 0:
            # fail with a readable message instead of numpy's "need at least one array"
            raise ValueError("cannot encode an empty list of statements")
        out = []
        for i in range(0, len(sents), EMBED_BATCH):
            batch = sents[i : i + EMBED_BATCH]
            enc = self._tok(
                batch, padding=True, truncation=True, max_length=MAX_TOKENS, return_tensors="np"
            )
            feeds = {self._innames[0]: enc["input_ids"], self._innames[1]: enc["attention_mask"]}
            hidden = self._cm(feeds)[self._cm.output(0)]
            # zero out padding positions, then average over the real tokens only
            mask = enc["attention_mask"][..., None].astype("float32")
            pooled = (hidden * mask).sum(1) / np.clip(mask.sum(1), 1, None)
            out.append(
                (pooled / (np.linalg.norm(pooled, axis=1, keepdims=True) + 1e-9)).astype(
                    np.float32
                )
            )
        return np.concatenate(out, 0)
130
+
131
+
132
class TorchEncoder:
    """mmBERT PyTorch encoder (GPU bf16 if available, else CPU fp32).

    Produces mean-pooled, L2-normalized statement embeddings from ``config.MMBERT_TORCH_MODEL``.
    """

    name = "torch"

    def __init__(self, offline: bool = True, device: str | None = None):
        """Load the tokenizer and model and move the model to the target device.

        Args:
            offline: when true, default ``HF_HUB_OFFLINE`` to ``"1"`` so loading cannot reach
                the Hub. An ``HF_HUB_OFFLINE`` value already in the environment is left alone.
            device: explicit device string; when omitted, ``"cuda"`` is used if available,
                else ``"cpu"``.

        Raises:
            ModelsNotInstalled: if the model dependencies are missing or the tokenizer/model
                fails to load.
        """
        # The offline flag has to be exported before transformers / huggingface_hub are first
        # imported (which `_require_models_extra` does): those libraries read it at import
        # time, so setting it afterwards is too late to take effect.
        if offline:
            os.environ.setdefault("HF_HUB_OFFLINE", "1")
        _require_models_extra()
        _set_hf_token()
        import torch
        from transformers import AutoConfig, AutoModel, AutoTokenizer

        self._torch = torch
        self._dev = device or ("cuda" if torch.cuda.is_available() else "cpu")
        # stderr is swallowed while the model loads to hide the library's own chatter
        with contextlib.redirect_stderr(io.StringIO()):
            conf = AutoConfig.from_pretrained(config.MMBERT_TORCH_MODEL)
            conf.reference_compile = False  # avoid the ModernBERT first-forward torch.compile hang
            try:
                self._tok = AutoTokenizer.from_pretrained(config.MMBERT_TORCH_MODEL)
                enc = AutoModel.from_pretrained(
                    config.MMBERT_TORCH_MODEL, config=conf, attn_implementation="eager"
                )
            except Exception as exc:
                raise ModelsNotInstalled(_INSTALL_HINT) from exc
        dtype = torch.bfloat16 if self._dev == "cuda" else torch.float32
        self._enc = enc.to(self._dev).to(dtype).eval()
        logger.debug("loaded Torch encoder '{}' on {}", config.MMBERT_TORCH_MODEL, self._dev)

    def encode(self, sents: list[str]) -> np.ndarray:
        """Embed ``sents`` in batches; returns one float32 row per statement.

        Each row is the attention-masked mean of ``last_hidden_state`` over the token axis,
        L2-normalized. Runs under ``torch.no_grad()``.

        Raises:
            ValueError: if ``sents`` is empty (there is nothing to concatenate).
        """
        if len(sents) == 0:
            # fail with a readable message instead of numpy's "need at least one array"
            raise ValueError("cannot encode an empty list of statements")
        torch = self._torch
        out = []
        with torch.no_grad():
            for i in range(0, len(sents), EMBED_BATCH):
                batch = sents[i : i + EMBED_BATCH]
                enc = self._tok(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=MAX_TOKENS,
                    return_tensors="pt",
                ).to(self._dev)
                hidden = self._enc(**enc).last_hidden_state.float()
                # zero out padding positions, then average over the real tokens only
                mask = enc["attention_mask"].unsqueeze(-1).float()
                pooled = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1)
                pooled = torch.nn.functional.normalize(pooled, dim=1)
                out.append(pooled.cpu().numpy().astype(np.float32))
        return np.concatenate(out, 0)
180
+
181
+
182
def load_encoder(backend: str = "openvino", offline: bool = True):
    """Build the encoder for ``backend``: ``"openvino"`` or ``"torch"``.

    ``offline`` is forwarded to the encoder's constructor.

    Raises:
        ValueError: if ``backend`` is neither of the two supported names.
    """
    if backend not in ("openvino", "torch"):
        raise ValueError(f"unknown backend {backend!r}; choose 'openvino' or 'torch'")
    encoder_cls = OpenVINOEncoder if backend == "openvino" else TorchEncoder
    return encoder_cls(offline=offline)
189
+
190
+
191
def download_models(backend: str = "openvino") -> list[str]:
    """Download and cache the models for ``backend`` in {openvino, torch, both}.

    The only function that fetches from the Hub. huggingface_hub renders TQDM download bars.
    The SAT segmenter is always fetched; the OpenVINO snapshot is skipped when the local model
    directory already contains ``openvino_model.xml``.

    Returns:
        The list of backend names that were prepared (``"both"`` expands to both names).

    Raises:
        ValueError: if ``backend`` is not one of the three supported names.
        ModelsNotInstalled: if the model dependencies are missing.
    """
    # Reject unknown names up front: previously such a value downloaded no encoder at all yet
    # was still reported as "models ready".
    if backend not in ("openvino", "torch", "both"):
        raise ValueError(f"unknown backend {backend!r}; choose 'openvino', 'torch' or 'both'")

    # Force online for the install step. This has to happen before transformers /
    # huggingface_hub are first imported (which `_require_models_extra` does), because those
    # libraries read the offline flag at import time.
    os.environ.pop("HF_HUB_OFFLINE", None)
    _require_models_extra()
    _set_hf_token()

    logger.info("downloading SAT segmenter '{}'", config.SAT_MODEL)
    with contextlib.redirect_stderr(io.StringIO()):
        from wtpsplit import SaT

        SaT(config.SAT_MODEL)

    backends = ["openvino", "torch"] if backend == "both" else [backend]
    if "openvino" in backends:
        if (config.MMBERT_OPENVINO_LOCAL / "openvino_model.xml").exists():
            logger.info(
                "openvino INT8 encoder already present at {}", config.MMBERT_OPENVINO_LOCAL
            )
        else:
            logger.info("downloading '{}'", config.MMBERT_OPENVINO_HF)
            from huggingface_hub import snapshot_download

            snapshot_download(config.MMBERT_OPENVINO_HF)
    if "torch" in backends:
        logger.info("downloading '{}'", config.MMBERT_TORCH_MODEL)
        from transformers import AutoConfig, AutoModel, AutoTokenizer

        # instantiating the objects is what populates the cache; the results are discarded
        conf = AutoConfig.from_pretrained(config.MMBERT_TORCH_MODEL)
        conf.reference_compile = False
        AutoTokenizer.from_pretrained(config.MMBERT_TORCH_MODEL)
        AutoModel.from_pretrained(
            config.MMBERT_TORCH_MODEL, config=conf, attn_implementation="eager"
        )

    logger.success("models ready for backend(s): {}", ", ".join(backends))
    return backends
@@ -0,0 +1,29 @@
1
+ from pathlib import Path
2
+
3
+ from loguru import logger
4
+ from tqdm import tqdm
5
+ import typer
6
+
7
+ from docdistance.config import PROCESSED_DATA_DIR
8
+
9
+ app = typer.Typer()
10
+
11
+
12
@app.command()
def main(
    # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ----
    input_path: Path = PROCESSED_DATA_DIR / "dataset.csv",
    output_path: Path = PROCESSED_DATA_DIR / "features.csv",
    # -----------------------------------------
):
    # Template placeholder: neither path is read or written; the loop only demonstrates
    # logging alongside a tqdm progress bar.
    # ---- REPLACE THIS WITH YOUR OWN CODE ----
    logger.info("Generating features from dataset...")
    for step in tqdm(range(10), total=10):
        if step != 5:
            continue
        logger.info("Something happened for iteration 5.")
    logger.success("Features generation complete.")
    # -----------------------------------------
26
+
27
+
28
+ if __name__ == "__main__":
29
+ app()
File without changes
@@ -0,0 +1,30 @@
1
+ from pathlib import Path
2
+
3
+ from loguru import logger
4
+ from tqdm import tqdm
5
+ import typer
6
+
7
+ from docdistance.config import MODELS_DIR, PROCESSED_DATA_DIR
8
+
9
+ app = typer.Typer()
10
+
11
+
12
@app.command()
def main(
    # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ----
    features_path: Path = PROCESSED_DATA_DIR / "test_features.csv",
    model_path: Path = MODELS_DIR / "model.pkl",
    predictions_path: Path = PROCESSED_DATA_DIR / "test_predictions.csv",
    # -----------------------------------------
):
    # Template placeholder for the inference step: it only logs while walking a ten-step
    # progress bar; none of the three path arguments is read or written yet.
    # (Described in comments rather than a docstring so no help text is added to the command.)
    # ---- REPLACE THIS WITH YOUR OWN CODE ----
    logger.info("Performing inference for model...")
    progress = tqdm(range(10), total=10)
    for step in progress:
        if step != 5:
            continue
        logger.info("Something happened for iteration 5.")
    logger.success("Inference complete.")
    # -----------------------------------------
27
+
28
+
29
+ if __name__ == "__main__":
30
+ app()
@@ -0,0 +1,30 @@
1
+ from pathlib import Path
2
+
3
+ from loguru import logger
4
+ from tqdm import tqdm
5
+ import typer
6
+
7
+ from docdistance.config import MODELS_DIR, PROCESSED_DATA_DIR
8
+
9
+ app = typer.Typer()
10
+
11
+
12
@app.command()
def main(
    # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ----
    features_path: Path = PROCESSED_DATA_DIR / "features.csv",
    labels_path: Path = PROCESSED_DATA_DIR / "labels.csv",
    model_path: Path = MODELS_DIR / "model.pkl",
    # -----------------------------------------
):
    # Template placeholder for the training step: it only logs while walking a ten-step
    # progress bar; none of the three path arguments is read or written yet.
    # (Described in comments rather than a docstring so no help text is added to the command.)
    # ---- REPLACE THIS WITH YOUR OWN CODE ----
    logger.info("Training some model...")
    progress = tqdm(range(10), total=10)
    for step in progress:
        if step != 5:
            continue
        logger.info("Something happened for iteration 5.")
    logger.success("Modeling training complete.")
    # -----------------------------------------
27
+
28
+
29
+ if __name__ == "__main__":
30
+ app()
@@ -0,0 +1,115 @@
1
+ """High-level document-distance API.
2
+
3
+ Two entry styles:
4
+
5
+ - :class:`DocDistance` - load the models once, score many pairs (the pipeline-integration entry)
6
+ - :func:`document_distance` / :func:`source_conditioned_distance` - one-shot convenience that loads
7
+ and scores in a single call
8
+
9
+ Inputs are raw text or a path to a text/markdown file (auto-detected). Every line of a file that
10
+ starts with ``# `` (a markdown level-1 heading) is stripped, so document titles do not count as statements.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from pathlib import Path
16
+
17
+ import numpy as np
18
+
19
+ from docdistance import distance as _core
20
+ from docdistance.distance import (
21
+ DEFAULT_THRESHOLD,
22
+ DistanceResult,
23
+ SourceConditionedResult,
24
+ )
25
+ from docdistance.encoders import Segmenter, load_encoder
26
+
27
+
28
def _load_body(path: Path) -> str:
    """Read a text/markdown file and return its body with level-1 heading lines removed.

    The file is decoded as UTF-8 regardless of the platform's locale encoding, and a leading
    byte-order mark is discarded so it can neither leak into the text nor hide a ``# `` title
    on the first line. Every line that starts with ``# `` is dropped (not only the first one),
    the remaining lines are re-joined with newlines, and surrounding whitespace is trimmed.

    Args:
        path: location of the file to read.

    Returns:
        The file content without ``# `` heading lines, stripped of outer whitespace.

    Raises:
        OSError: if the file cannot be read.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 too; it only additionally swallows a leading BOM.
    lines = path.read_text(encoding="utf-8-sig").splitlines()
    return "\n".join(ln for ln in lines if not ln.startswith("# ")).strip()
31
+
32
+
33
+ def _read(doc: str | Path) -> str:
34
+ """Resolve a document argument: an existing file path is read, anything else is treated as text."""
35
+ if isinstance(doc, Path):
36
+ return _load_body(doc)
37
+ if isinstance(doc, str):
38
+ try:
39
+ p = Path(doc)
40
+ if p.exists() and p.is_file():
41
+ return _load_body(p)
42
+ except OSError:
43
+ pass
44
+ return doc.strip()
45
+ raise TypeError(f"document must be str or Path, got {type(doc).__name__}")
46
+
47
+
48
class DocDistance:
    """Reusable scorer: the segmenter and encoder are created once in the constructor, after
    which :meth:`distance` / :meth:`distance_wrt_source` can be called for any number of pairs."""

    def __init__(self, backend: str = "openvino", offline: bool = True):
        # Remember which encoder backend was requested, then build both models up front.
        self.backend = backend
        self.segmenter = Segmenter(offline=offline)
        self.encoder = load_encoder(backend, offline=offline)

    def embed(self, doc: str | Path) -> np.ndarray:
        """Segment then embed a document into L2-normalized statement vectors ``[n, dim]``.

        Raises ``ValueError`` when segmentation yields no statements.
        """
        text = _read(doc)
        statements = self.segmenter.split(text)
        if statements:
            return self.encoder.encode(statements)
        raise ValueError("document produced no statements")

    def distance(
        self,
        a: str | Path,
        b: str | Path,
        *,
        anisotropy: bool = False,
        threshold: float = DEFAULT_THRESHOLD,
    ) -> DistanceResult:
        """Embed ``a`` and ``b`` (in that order) and hand both to ``compute_distance``."""
        vectors_a = self.embed(a)
        vectors_b = self.embed(b)
        return _core.compute_distance(
            vectors_a, vectors_b, anisotropy=anisotropy, threshold=threshold
        )

    def distance_wrt_source(
        self,
        a: str | Path,
        b: str | Path,
        source: str | Path,
        *,
        anisotropy: bool = False,
    ) -> SourceConditionedResult:
        """Embed ``a``, ``b`` and ``source`` (in that order) for ``compute_source_conditioned``."""
        vectors_a = self.embed(a)
        vectors_b = self.embed(b)
        vectors_source = self.embed(source)
        return _core.compute_source_conditioned(
            vectors_a, vectors_b, vectors_source, anisotropy=anisotropy
        )
86
+
87
+
88
def document_distance(
    a: str | Path,
    b: str | Path,
    *,
    backend: str = "openvino",
    anisotropy: bool = False,
    threshold: float = DEFAULT_THRESHOLD,
    offline: bool = True,
) -> DistanceResult:
    """One-shot symmetric Statement Mover's Distance between documents ``a`` and ``b``.

    A fresh :class:`DocDistance` is built on every call (so the models are loaded each time)
    and its :meth:`DocDistance.distance` result is returned.
    """
    scorer = DocDistance(backend=backend, offline=offline)
    return scorer.distance(a, b, anisotropy=anisotropy, threshold=threshold)
101
+
102
+
103
def source_conditioned_distance(
    a: str | Path,
    b: str | Path,
    source: str | Path,
    *,
    backend: str = "openvino",
    anisotropy: bool = False,
    offline: bool = True,
) -> SourceConditionedResult:
    """One-shot source-conditioned distance d(A, B | S) for documents ``a``, ``b`` and ``source``.

    A fresh :class:`DocDistance` is built on every call (so the models are loaded each time)
    and its :meth:`DocDistance.distance_wrt_source` result is returned.
    """
    scorer = DocDistance(backend=backend, offline=offline)
    return scorer.distance_wrt_source(a, b, source, anisotropy=anisotropy)
docdistance/plots.py ADDED
@@ -0,0 +1,29 @@
1
+ from pathlib import Path
2
+
3
+ from loguru import logger
4
+ from tqdm import tqdm
5
+ import typer
6
+
7
+ from docdistance.config import FIGURES_DIR, PROCESSED_DATA_DIR
8
+
9
+ app = typer.Typer()
10
+
11
+
12
@app.command()
def main(
    # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ----
    input_path: Path = PROCESSED_DATA_DIR / "dataset.csv",
    output_path: Path = FIGURES_DIR / "plot.png",
    # -----------------------------------------
):
    # Template placeholder for the plotting step: it only logs while walking a ten-step
    # progress bar; neither path argument is read or written yet.
    # (Described in comments rather than a docstring so no help text is added to the command.)
    # ---- REPLACE THIS WITH YOUR OWN CODE ----
    logger.info("Generating plot from data...")
    progress = tqdm(range(10), total=10)
    for step in progress:
        if step != 5:
            continue
        logger.info("Something happened for iteration 5.")
    logger.success("Plot generation complete.")
    # -----------------------------------------
26
+
27
+
28
+ if __name__ == "__main__":
29
+ app()
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: docdistance
3
+ Version: 1.0.15
4
+ Summary: Project that uses theory of From Word Embeddings To Document Distances / Optimal Transport to give meaningful distance from one document to another, useful if building agentic projects that convert or extract information from one document to another using frontier models but without the ability to calculate KL divergence from logits
5
+ Author: Stellars Henson <konrad.jelen+github@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/stellarshenson/docdistance
8
+ Project-URL: Repository, https://github.com/stellarshenson/docdistance
9
+ Project-URL: Issues, https://github.com/stellarshenson/docdistance/issues
10
+ Keywords: optimal-transport,word-movers-distance,statement-movers-distance,document-similarity,document-distance,embeddings,mmbert,nlp
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Classifier: Topic :: Text Processing :: Linguistic
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Science/Research
18
+ Requires-Python: ~=3.13.0
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: loguru
22
+ Requires-Dist: tqdm
23
+ Requires-Dist: typer
24
+ Requires-Dist: rich
25
+ Requires-Dist: python-dotenv
26
+ Requires-Dist: numpy
27
+ Requires-Dist: pot
28
+ Requires-Dist: transformers
29
+ Requires-Dist: wtpsplit
30
+ Requires-Dist: openvino
31
+ Requires-Dist: torch
32
+ Provides-Extra: dev
33
+ Requires-Dist: build; extra == "dev"
34
+ Requires-Dist: ipykernel; extra == "dev"
35
+ Requires-Dist: ipython; extra == "dev"
36
+ Requires-Dist: nbdime; extra == "dev"
37
+ Requires-Dist: pip; extra == "dev"
38
+ Requires-Dist: pytest; extra == "dev"
39
+ Requires-Dist: pytest-cov; extra == "dev"
40
+ Requires-Dist: ruff; extra == "dev"
41
+ Requires-Dist: twine; extra == "dev"
42
+ Requires-Dist: matplotlib; extra == "dev"
43
+ Requires-Dist: seaborn; extra == "dev"
44
+ Dynamic: license-file
45
+
46
+ # docdistance
47
+
48
+ Semantic distance between two documents via Statement Mover's Distance - optimal transport over mmBERT statement embeddings, after Kusner et al. 2015 (*From Word Embeddings To Document Distances*). A thin frontend to the library; the SOTA docs carry the mechanics, benchmarks, and validation.
49
+
50
+ - **Input** - two documents, raw text or a file path
51
+ - **Output** - an SMD distance, a 0..1 closeness, a verdict, and the statement alignment
52
+ - **Use** - agentic document conversion and extraction pipelines, where token logits are unavailable and KL divergence cannot be computed
53
+ - **Unit** - statement-level and position-invariant, with an interpretable transport plan
54
+
55
+ ## Theory
56
+
57
+ A document distance grounded in embeddings and optimal transport, not surface overlap.
58
+
59
+ - **WMD** - Word Mover's Distance (Kusner et al. 2015) casts document similarity as optimal transport between embedded tokens
60
+ - **SMD** - this project lifts it to statements: segment, embed, transport between the two statement clouds
61
+ - **Beyond cosine** - whole-document cosine collapses when the same claims sit in a different place or order; statement-level transport is position-invariant
62
+ - **Metric** - the ground cost `√(2 − 2cos)` on L2-normalized embeddings is a metric, so the document distance is one too
63
+ - **Logit-free** - an embedding-grounded alternative where token probabilities (KL divergence) are unavailable, as in frontier-model pipelines
64
+
65
+ ## Method
66
+
67
+ Three stages; the transport plan is the interpretable by-product.
68
+
69
+ 1. **Segment** - split each document into atomic statements with the SAT (Segment Any Text) segmenter
70
+ 2. **Embed** - encode each statement with the mmBERT contextual encoder (mean-pooled, L2-normalized)
71
+ 3. **Compare** - optimal transport between the two statement clouds (Statement Mover's Distance), optionally unbalanced so added or missing statements are scored, not force-matched
72
+
73
+ - **Closeness** - `1 − SMD/√2`, on a 0..1 scale
74
+ - **Source-conditioned** - a variant `d(A, B | S)` re-bases the transport onto a shared source `S` and reads off a selection axis and a grounding axis
75
+
76
+ ## Usage
77
+
78
+ The library is the product; install once, then call it.
79
+
80
+ ```python
81
+ from docdistance import document_distance
82
+
83
+ result = document_distance("report_v1.md", "report_v2.md")
84
+ print(result.closeness) # 0..1 similarity, 1 - SMD/sqrt(2)
85
+ print(result.verdict) # "similar" | "not similar"
86
+ ```
87
+
88
+ ```bash
89
+ make install # environment, package, Jupyter kernel
90
+ docdistance install # download + cache the models (once)
91
+ docdistance distance a.md b.md # rich, coloured verdict
92
+ docdistance distance a.md b.md --json # machine-readable JSON
93
+ ```
94
+
95
+ - **Offline after install** - distance calls run fully offline once the models are cached
96
+ - **Backend** - `--backend openvino|torch`, default `openvino` (CPU INT8)
97
+ - **Full API and flags** - `docdistance --help` and the SOTA docs
98
+
99
+ ## Documentation
100
+
101
+ The SOTA documents explain how it works in detail; this README only introduces it.
102
+
103
+ - `docs/wmd-docdistance-solution-sota.md` - source-free distance: design, mechanism, performance, validation
104
+ - `docs/wmd-wrt-source-docdistance-solution.md` - source-conditioned distance `d(A,B|S)`
105
+ - `docs/mmbert-quantization-solution.md` - the INT8 / FP8 statement encoder
106
+ - `references/papers/from-word-embeddings-to-document-distances.md` - WMD paper digest (Kusner et al. 2015)
107
+
108
+ > **Note**: Scaffolded with the [copier-data-science](https://github.com/stellarshenson/copier-data-science) template.
@@ -0,0 +1,18 @@
1
+ docdistance/__init__.py,sha256=7JPs0Q9AwNqWMFm8wcnrTWiP0SNnpAX2p0tA1NjT5gY,852
2
+ docdistance/cli.py,sha256=_bKjQf2gV-RpqJZ7d1RfwoUcZA18hk9bLvmCGcZz4Yg,8222
3
+ docdistance/config.py,sha256=5GZ7NYDn24Kg6ZBczNYlGiuyla_4_sOJ5zhuVx-6ZD4,2106
4
+ docdistance/dataset.py,sha256=huS5XJ_ydmV6rbtokI62GEgwQviUbe1HWHX5Hyutjmw,771
5
+ docdistance/distance.py,sha256=xklp-zY_uGa6EbTV1WYBVfs2XGx1Piy8eugf4eGO1h4,8415
6
+ docdistance/encoders.py,sha256=LND3YWJFRUP7XwN2N1npNrvfao25FzxtbpyFVXjQO_4,9315
7
+ docdistance/features.py,sha256=UU2MNtJ5gjcDm9j2QzQAQxFV6NAwnGKQIslUUreyKLU,774
8
+ docdistance/pipeline.py,sha256=lZisWFpzJqKUWYp80gSz05G8ZHjH1-JvKFd3WdX0SDs,3681
9
+ docdistance/plots.py,sha256=70V_HtIyDSlmoP5hW8Ub3tlm1Vi0X5wSeDZPINxzBJ4,765
10
+ docdistance/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ docdistance/modeling/predict.py,sha256=3B61xaFKG7GRXvwe3xhX5YmJa_M8P3iPWH3uGGcch5w,845
12
+ docdistance/modeling/train.py,sha256=OH6okBYDaUG8tLmEoyBTBHqZmJUvdVp22_hnIdNY0Bo,822
13
+ docdistance-1.0.15.dist-info/licenses/LICENSE,sha256=AtXSSglTQoyugtRKTcRY-XeGVUG83jnSek9Q3iM8rb8,1116
14
+ docdistance-1.0.15.dist-info/METADATA,sha256=uUxUlIpBThv-S6T6p5W3IvrKBNJpmwoAOcp-ZNIH23I,5561
15
+ docdistance-1.0.15.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
16
+ docdistance-1.0.15.dist-info/entry_points.txt,sha256=O3HurDhddDvy5pgGPEgac-y-FgDsBHkDPN7IIuJEKJ8,52
17
+ docdistance-1.0.15.dist-info/top_level.txt,sha256=BJP9ozRKdJaw9aUUmkAtLbrb-3QRc4oXgkSaZMGyJG8,12
18
+ docdistance-1.0.15.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ docdistance = docdistance.cli:app
@@ -0,0 +1,10 @@
1
+
2
+ The MIT License (MIT)
3
+ Copyright (c) 2026, Stellars Henson <konrad.jelen+github@gmail.com>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10
+
@@ -0,0 +1 @@
1
+ docdistance