docdistance 1.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docdistance/__init__.py +38 -0
- docdistance/cli.py +229 -0
- docdistance/config.py +66 -0
- docdistance/dataset.py +29 -0
- docdistance/distance.py +231 -0
- docdistance/encoders.py +229 -0
- docdistance/features.py +29 -0
- docdistance/modeling/__init__.py +0 -0
- docdistance/modeling/predict.py +30 -0
- docdistance/modeling/train.py +30 -0
- docdistance/pipeline.py +115 -0
- docdistance/plots.py +29 -0
- docdistance-1.0.15.dist-info/METADATA +108 -0
- docdistance-1.0.15.dist-info/RECORD +18 -0
- docdistance-1.0.15.dist-info/WHEEL +5 -0
- docdistance-1.0.15.dist-info/entry_points.txt +2 -0
- docdistance-1.0.15.dist-info/licenses/LICENSE +10 -0
- docdistance-1.0.15.dist-info/top_level.txt +1 -0
docdistance/__init__.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
2
|
+
|
|
3
|
+
from docdistance import config # noqa: F401 (sets up logging + paths on import)
|
|
4
|
+
from docdistance.distance import (
|
|
5
|
+
DistanceResult,
|
|
6
|
+
SourceConditionedResult,
|
|
7
|
+
closeness,
|
|
8
|
+
compute_distance,
|
|
9
|
+
compute_source_conditioned,
|
|
10
|
+
rwmd,
|
|
11
|
+
smd,
|
|
12
|
+
wcd,
|
|
13
|
+
)
|
|
14
|
+
from docdistance.pipeline import (
|
|
15
|
+
DocDistance,
|
|
16
|
+
document_distance,
|
|
17
|
+
source_conditioned_distance,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Resolve the version string from the installed distribution's metadata.
try:
    __version__ = version("docdistance")
except PackageNotFoundError:  # running from source, not installed
    # No distribution metadata available: fall back to a placeholder version.
    __version__ = "0.0.0"

# Public API of the package: the names re-exported from docdistance.distance and
# docdistance.pipeline above, plus the version string.
__all__ = [
    "DocDistance",
    "DistanceResult",
    "SourceConditionedResult",
    "document_distance",
    "source_conditioned_distance",
    "compute_distance",
    "compute_source_conditioned",
    "smd",
    "wcd",
    "rwmd",
    "closeness",
    "__version__",
]
|
docdistance/cli.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""docdistance command-line interface.
|
|
2
|
+
|
|
3
|
+
Three subcommands - ``install`` (the only one that downloads models), ``distance`` (symmetric SMD)
|
|
4
|
+
and ``distance-wrt-source`` (source-conditioned). Human output is rich and coloured on a capable
|
|
5
|
+
terminal; ``--json`` emits machine-readable JSON and ``--result-only`` emits the bare result.
|
|
6
|
+
Logs go to stderr (loguru, ``--verbose`` for DEBUG), so stdout carries only the result.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from enum import Enum
|
|
12
|
+
import json
|
|
13
|
+
|
|
14
|
+
from rich.console import Console
|
|
15
|
+
from rich.panel import Panel
|
|
16
|
+
from rich.table import Table
|
|
17
|
+
import typer
|
|
18
|
+
|
|
19
|
+
from docdistance.config import configure_logging
|
|
20
|
+
from docdistance.distance import DEFAULT_THRESHOLD
|
|
21
|
+
|
|
22
|
+
# Root Typer application; the commands below register themselves on it via decorators.
app = typer.Typer(
    help="[bold]docdistance[/bold] - semantic distance between documents via Statement Mover's Distance "
    "(optimal transport over mmBERT statement embeddings).",
    add_completion=False,
    no_args_is_help=True,
    rich_markup_mode="rich",
)

# Two consoles so results and diagnostics never share a stream.
_err = Console(stderr=True)  # stderr, for errors
_out = Console()  # stdout, for the result
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class Backend(str, Enum):
    """Statement-encoder backends accepted by the ``--backend`` option of the distance commands."""

    # str mixin: each member compares equal to its plain string value
    openvino = "openvino"
    torch = "torch"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class InstallBackend(str, Enum):
    """Choices for ``install --backend``: a single encoder backend, or ``both``."""

    # str mixin: each member compares equal to its plain string value
    openvino = "openvino"
    torch = "torch"
    both = "both"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _version_cb(value: bool):
    """Option callback for ``--version``: print ``docdistance <version>`` and exit.

    Returns without doing anything when ``value`` is falsy (the flag was not given).
    """
    if not value:
        return
    # resolved at call time, inside the callback, rather than at module import
    from docdistance import __version__

    typer.echo(f"docdistance {__version__}")
    raise typer.Exit()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# App-level callback: its only job is to host the global --version option.
@app.callback()
def main(
    # is_eager=True asks for _version_cb to be processed ahead of the other parameters
    version: bool = typer.Option(
        False, "--version", callback=_version_cb, is_eager=True, help="show version and exit"
    ),
):
    """Semantic document distance grounded in optimal-transport theory."""
    # No body beyond the docstring: all work happens in _version_cb and the subcommands.
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _run(fn):
    """Invoke ``fn()`` and hand back its result, exiting cleanly when models are missing.

    A ``ModelsNotInstalled`` raised by ``fn`` is printed to stderr as a red ``error:`` line
    and converted into ``typer.Exit(1)``. Any other exception propagates unchanged.
    """
    from docdistance.encoders import ModelsNotInstalled

    try:
        outcome = fn()
    except ModelsNotInstalled as exc:
        _err.print(f"[bold red]error:[/bold red] {exc}")
        raise typer.Exit(1)
    return outcome
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _emit_distance(r, json_out: bool, result_only: bool) -> None:
    """Write a symmetric-distance result to stdout in the requested format.

    ``result_only`` wins over ``json_out``: it prints just ``r.smd``. ``json_out`` prints
    ``r.to_dict()`` as indented JSON. Otherwise a rich panel is rendered, coloured green
    when the verdict is ``"similar"`` and red for anything else.
    """
    if result_only:
        typer.echo(str(r.smd))
    elif json_out:
        typer.echo(json.dumps(r.to_dict(), indent=2))
    else:
        color = "green" if r.verdict == "similar" else "red"
        # (label, value) pairs, in display order
        rows = [
            ("SMD (distance)", f"{r.smd:.4f}"),
            ("closeness", f"{r.closeness * 100:.1f}%"),
            ("verdict", f"[{color}]{r.verdict}[/{color}] (threshold {r.threshold:.2f} closeness)"),
            ("bounds", f"WCD {r.wcd:.4f} ≤ RWMD {r.rwmd:.4f} ≤ SMD {r.smd:.4f}"),
            ("statements", f"{r.n_statements_a} vs {r.n_statements_b}"),
            ("anisotropy", "on" if r.anisotropy else "off"),
        ]
        grid = Table.grid(padding=(0, 2))
        grid.add_column(style="bold cyan")
        grid.add_column()
        for label, value in rows:
            grid.add_row(label, value)
        panel = Panel(grid, title="[bold]Document distance[/bold]", border_style=color, expand=False)
        _out.print(panel)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _emit_wrt_source(r, json_out: bool, result_only: bool) -> None:
    """Write a source-conditioned result to stdout in the requested format.

    ``result_only`` wins over ``json_out``: it prints ``d_sel,residual_a,residual_b`` on one
    line. ``json_out`` prints ``r.to_dict()`` as indented JSON. Otherwise a cyan rich panel
    is rendered, followed by a dim footnote about what the residual does and does not cover.
    """
    if result_only:
        typer.echo(f"{r.d_sel},{r.residual_a},{r.residual_b}")
    elif json_out:
        typer.echo(json.dumps(r.to_dict(), indent=2))
    else:
        # (label, value) pairs, in display order
        rows = [
            ("D_sel (selection divergence)", f"{r.d_sel:.4f}"),
            ("A → source", f"{r.residual_a:.4f} (closeness {r.closeness_a * 100:.1f}%)"),
            ("B → source", f"{r.residual_b:.4f} (closeness {r.closeness_b * 100:.1f}%)"),
            (
                "statements",
                f"A {r.n_statements_a} / B {r.n_statements_b} / S {r.n_statements_source}",
            ),
        ]
        grid = Table.grid(padding=(0, 2))
        grid.add_column(style="bold cyan")
        grid.add_column()
        for label, value in rows:
            grid.add_row(label, value)
        panel = Panel(
            grid,
            title="[bold]Source-conditioned distance d(A,B|S)[/bold]",
            border_style="cyan",
            expand=False,
        )
        _out.print(panel)
        _out.print(
            "[dim]residual = geometric distance to the source; the reranker + NLI grounding grade and "
            "numeric verifier are deferred to E02[/dim]"
        )
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@app.command(
    epilog="[bold]Examples[/bold]\n\n"
    " docdistance distance report_v1.md report_v2.md\n"
    ' docdistance distance "first text" "second text" --backend torch\n'
    " docdistance distance a.md b.md --json\n"
    " docdistance distance a.md b.md --result-only"
)
def distance(
    a: str = typer.Argument(..., help="first document - a file path or raw text"),
    b: str = typer.Argument(..., help="second document - a file path or raw text"),
    backend: Backend = typer.Option(
        Backend.openvino, "--backend", help="statement encoder backend"
    ),
    anisotropy: bool = typer.Option(
        False,
        "--anisotropy/--no-anisotropy",
        help="all-but-the-top anisotropy removal - needs a corpus, off by default for a pair",
    ),
    threshold: float = typer.Option(
        DEFAULT_THRESHOLD,
        "--threshold",
        help="closeness cutoff for the similar / not-similar verdict",
    ),
    json_out: bool = typer.Option(False, "--json", help="machine-readable JSON to stdout"),
    result_only: bool = typer.Option(
        False, "--result-only", help="bare SMD scalar to stdout, no clutter"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="DEBUG logging to stderr"),
):
    """Symmetric distance between two documents - the exact Statement Mover's Distance."""
    # logging is configured first so anything logged below honours --verbose
    configure_logging(verbose)
    # the pipeline is imported only once the command actually runs
    from docdistance.pipeline import document_distance

    def _compute():
        # deferred so _run can translate a missing-model error into a clean exit
        return document_distance(
            a, b, backend=backend.value, anisotropy=anisotropy, threshold=threshold
        )

    _emit_distance(_run(_compute), json_out, result_only)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@app.command(
    name="distance-wrt-source",
    epilog="[bold]Examples[/bold]\n\n"
    " docdistance distance-wrt-source summary_a.md summary_b.md --source article.md\n"
    " docdistance distance-wrt-source a.md b.md -s s.md --json\n"
    " docdistance distance-wrt-source a.md b.md -s s.md --result-only [dim]# D_sel,res_a,res_b[/dim]",
)
def distance_wrt_source(
    a: str = typer.Argument(..., help="first document - a file path or raw text"),
    b: str = typer.Argument(..., help="second document - a file path or raw text"),
    source: str = typer.Option(..., "--source", "-s", help="the common source document"),
    backend: Backend = typer.Option(
        Backend.openvino, "--backend", help="statement encoder backend"
    ),
    anisotropy: bool = typer.Option(
        False,
        "--anisotropy/--no-anisotropy",
        help="anisotropy removal - needs a corpus, off by default",
    ),
    json_out: bool = typer.Option(False, "--json", help="machine-readable JSON to stdout"),
    result_only: bool = typer.Option(
        False, "--result-only", help="bare comma-separated D_sel,residual_a,residual_b to stdout"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="DEBUG logging to stderr"),
):
    """Source-conditioned distance d(A, B | S) - selection divergence plus each document's distance to S."""
    # logging is configured first so anything logged below honours --verbose
    configure_logging(verbose)
    # the pipeline is imported only once the command actually runs
    from docdistance.pipeline import source_conditioned_distance

    def _compute():
        # deferred so _run can translate a missing-model error into a clean exit
        return source_conditioned_distance(
            a, b, source, backend=backend.value, anisotropy=anisotropy
        )

    _emit_wrt_source(_run(_compute), json_out, result_only)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
@app.command(
    epilog="[bold]Examples[/bold]\n\n"
    " docdistance install [dim]# both backends[/dim]\n"
    " docdistance install --backend openvino",
)
def install(
    backend: InstallBackend = typer.Option(
        InstallBackend.both, "--backend", help="which encoder weights to fetch"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="DEBUG logging to stderr"),
):
    """Download and cache the models - the only command that fetches from the Hub (TQDM progress bars)."""
    configure_logging(verbose)
    from docdistance.encoders import download_models

    # Reuse the shared error boundary instead of repeating its try/except here:
    # _run prints a ModelsNotInstalled message to stderr and exits with code 1.
    backends = _run(lambda: download_models(backend.value))
    _out.print(f"[green]models ready:[/green] {', '.join(backends)}")
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# Run the Typer app when this module is executed as a script.
if __name__ == "__main__":
    app()
|
docdistance/config.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
from dotenv import load_dotenv
|
|
5
|
+
from loguru import logger
|
|
6
|
+
|
|
7
|
+
########### SETUP ###############
|
|
8
|
+
|
|
9
|
+
# set up logger - INFO by default (DEBUG only via the CLI --verbose flag), sink to stderr so
|
|
10
|
+
# stdout stays clean for --json / --result-only output
|
|
11
|
+
# Start from a clean slate, then register exactly one INFO-level sink on stderr.
logger.remove()

# If tqdm is installed, configure loguru with tqdm.write
# https://github.com/Delgan/loguru/issues/135
try:
    from tqdm import tqdm
except ModuleNotFoundError:
    # no tqdm available: write straight to stderr
    logger.add(sys.stderr, colorize=True, level="INFO")
else:
    logger.add(lambda msg: tqdm.write(msg, end="", file=sys.stderr), colorize=True, level="INFO")
|
|
23
|
+
|
|
24
|
+
########## VARIABLES ############
|
|
25
|
+
|
|
26
|
+
# Load environment variables from .env file if it exists
|
|
27
|
+
# Side effect at import time: read variables from a .env file if one is found.
load_dotenv()

# paths
# parents[2] is the parent of the directory that contains the ``docdistance`` package.
# NOTE(review): for an installed wheel (site-packages/docdistance/config.py) that lands
# inside the interpreter's library tree, not a project checkout - confirm that the
# directories derived below are only relied on when running from a source tree.
PROJ_ROOT = Path(__file__).resolve().parents[2]
DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
INTERIM_DATA_DIR = DATA_DIR / "interim"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
EXTERNAL_DATA_DIR = DATA_DIR / "external"
MODELS_DIR = PROJ_ROOT / "models"
REPORTS_DIR = PROJ_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"

# log current root dir (debug so it never pollutes machine-readable stdout)
# With the INFO-level sink registered above, this DEBUG record is filtered out at import.
logger.debug(f"PROJ_ROOT path is: {PROJ_ROOT}")
|
|
42
|
+
|
|
43
|
+
########## MODELS ###############
|
|
44
|
+
|
|
45
|
+
# segmenter (wtpsplit SaT) and the mmBERT statement encoders, by backend
|
|
46
|
+
# wtpsplit SaT segmenter model name
SAT_MODEL = "sat-3l-sm"
# Hugging Face repo id of the PyTorch mmBERT encoder
MMBERT_TORCH_MODEL = "jhu-clsp/mmBERT-base"
# local directory checked for the OpenVINO INT8 export
MMBERT_OPENVINO_LOCAL = MODELS_DIR / "02-mmbert-openvino-int8"
# Hugging Face repo id of the OpenVINO INT8 export
MMBERT_OPENVINO_HF = "stellars/mmBERT-base-openvino-int8"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def configure_logging(verbose: bool = False) -> None:
    """Replace every loguru sink with a single stderr sink at INFO, or DEBUG when ``verbose``.

    Messages go through ``tqdm.write`` when tqdm can be imported, and straight to
    ``sys.stderr`` otherwise. Either way nothing is written to stdout, which stays reserved
    for ``--json`` / ``--result-only`` output.
    """
    logger.remove()
    try:
        from tqdm import tqdm
    except ModuleNotFoundError:
        sink = sys.stderr
    else:

        def sink(msg):
            # sys.stderr is looked up on every call, not captured once
            tqdm.write(msg, end="", file=sys.stderr)

    logger.add(sink, colorize=True, level="DEBUG" if verbose else "INFO")
|
docdistance/dataset.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from loguru import logger
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
import typer
|
|
6
|
+
|
|
7
|
+
from docdistance.config import PROCESSED_DATA_DIR, RAW_DATA_DIR
|
|
8
|
+
|
|
9
|
+
# Stand-alone Typer app for the dataset step.
app = typer.Typer()


@app.command()
def main(
    # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ----
    input_path: Path = RAW_DATA_DIR / "dataset.csv",
    output_path: Path = PROCESSED_DATA_DIR / "dataset.csv",
    # ----------------------------------------------
):
    # ---- REPLACE THIS WITH YOUR OWN CODE ----
    # Placeholder body: neither path is read or written; it only logs around a
    # ten-step progress bar.
    logger.info("Processing dataset...")
    for step in tqdm(range(10), total=10):
        if step == 5:
            logger.info("Something happened for iteration 5.")
    logger.success("Processing dataset complete.")
    # -----------------------------------------


# Run the Typer app when this module is executed as a script.
if __name__ == "__main__":
    app()
|
docdistance/distance.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
"""Pure-numpy optimal-transport core for document distance.
|
|
2
|
+
|
|
3
|
+
No heavy ML dependencies - only numpy and POT (``ot``). Segmentation and embedding live in
|
|
4
|
+
``encoders.py``; this module operates on statement-embedding arrays: L2-normalized float32 of
|
|
5
|
+
shape ``[n_statements, dim]``. Every function here is deterministic and CPU-only, which is why
|
|
6
|
+
the unit tests can exercise it without loading a single model.
|
|
7
|
+
|
|
8
|
+
The distance is the exact Statement Mover's Distance (SMD) - optimal transport between two
|
|
9
|
+
statement clouds with the metric ground cost ``sqrt(2 - 2cos)`` (Euclidean on L2-normalized
|
|
10
|
+
vectors). ``wcd`` and ``rwmd`` are the cheap lower bounds (``WCD <= RWMD <= SMD``). The
|
|
11
|
+
source-conditioned helpers re-base the transport onto a common source ``S``.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import asdict, dataclass
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
import ot
|
|
20
|
+
|
|
21
|
+
# Normalizer used by closeness(): sqrt(2) is the Euclidean distance between two orthogonal
# unit vectors, so orthogonal statement clouds map to closeness 0. The [0, sqrt(2)] range
# relies on cos >= 0 for these embeddings.
SMD_MAX = float(np.sqrt(2.0))

# closeness cutoff for the similar / not-similar verdict; heuristic, calibrate per corpus
# (measured boundary on the ibm-ai-adoption fixtures: min gold 72.7% vs max adversarial 72.2%)
DEFAULT_THRESHOLD = 0.725
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def cost_matrix(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """Pairwise Euclidean ground cost between the rows of ``X`` and the rows of ``Y``.

    On L2-normalized rows the Euclidean distance equals ``sqrt(2 - 2cos)``, which is a metric.
    """
    pairwise = ot.dist(X, Y, metric="euclidean")
    return pairwise
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _uniform(n: int) -> np.ndarray:
    """Return the uniform probability vector of length ``n`` (every entry is ``1 / n``).

    Raises:
        ValueError: if ``n`` is not positive. An empty statement cloud has no uniform
            distribution; failing here gives a clear message instead of a bare
            ``ZeroDivisionError`` from ``1.0 / n``.
    """
    if n <= 0:
        raise ValueError(f"need at least one statement to build a uniform distribution (got n={n})")
    return np.full(n, 1.0 / n)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _ab(X: np.ndarray, Y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Uniform mass vectors for ``X`` and ``Y``, one entry per row of each array."""
    mass_x = _uniform(len(X))
    mass_y = _uniform(len(Y))
    return mass_x, mass_y
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def smd(X: np.ndarray, Y: np.ndarray) -> float:
    """Exact Statement Mover's Distance between two statement clouds.

    Solves optimal transport (``ot.emd2``) between uniform masses on the rows of ``X`` and
    ``Y`` under the Euclidean ground cost, and returns the transport cost as a Python float.
    """
    mass_x, mass_y = _ab(X, Y)
    ground = cost_matrix(X, Y)
    return float(ot.emd2(mass_x, mass_y, ground))
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def wcd(X: np.ndarray, Y: np.ndarray) -> float:
    """Lower bound: Euclidean distance between the mean-pooled (centroid) vectors of ``X`` and ``Y``."""
    centroid_x = X.mean(0)
    centroid_y = Y.mean(0)
    gap = centroid_x - centroid_y
    return float(np.linalg.norm(gap))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def rwmd(X: np.ndarray, Y: np.ndarray) -> float:
    """Lower bound: one-sided relaxation (greedy nearest-statement alignment).

    Each side sends its uniform mass to its cheapest counterpart on the other side; the
    result is the larger of the two one-sided costs.
    """
    weights_x, weights_y = _ab(X, Y)
    costs = cost_matrix(X, Y)
    x_to_y = (weights_x * costs.min(1)).sum()  # every row of X to its nearest row of Y
    y_to_x = (weights_y * costs.min(0)).sum()  # every row of Y to its nearest row of X
    return float(max(x_to_y, y_to_x))
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def closeness(d: float) -> float:
    """Convert a distance into a similarity: ``1 - d / SMD_MAX``, floored at 0.

    A distance of 0 gives 1.0; a distance of ``SMD_MAX`` or more gives 0.0.
    """
    similarity = 1.0 - d / SMD_MAX
    return max(0.0, similarity)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def verdict(close: float, threshold: float = DEFAULT_THRESHOLD) -> str:
    """Label a closeness score: ``"similar"`` at or above ``threshold``, else ``"not similar"``."""
    if close >= threshold:
        return "similar"
    return "not similar"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def all_but_the_top(emb: dict[str, np.ndarray], k: int = 1) -> dict[str, np.ndarray]:
    """All-but-the-top anisotropy removal (Mu & Viswanath, ICLR 2018).

    The statement rows of every document in ``emb`` are pooled, the pooled mean is
    subtracted, the top-``k`` right singular vectors of the centered pool are projected out,
    and each row is re-L2-normalized. Pooling first means all documents share the same
    removed direction. The result has the same keys and per-document row counts as ``emb``,
    as float32 arrays.
    """
    names = list(emb)
    stacked = np.concatenate([emb[name] for name in names], 0)
    centered = stacked - stacked.mean(0)
    # rows of the third SVD factor are the principal directions; keep the leading k
    top = np.linalg.svd(centered, full_matrices=False)[2][:k]
    debiased = centered - centered @ top.T @ top
    # the small epsilon guards against dividing by a zero norm
    debiased = debiased / (np.linalg.norm(debiased, axis=1, keepdims=True) + 1e-9)
    # cut the pooled matrix back into per-document slices, in key order
    boundaries = np.cumsum([len(emb[name]) for name in names])[:-1]
    pieces = np.split(debiased, boundaries)
    return {name: piece.astype(np.float32) for name, piece in zip(names, pieces)}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# --- source-conditioned core (metric parts only; reranker/NLI grounding deferred to E02) ---
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# default softmax temperature for coverage_profile
COVERAGE_TEMPERATURE = 0.1


def coverage_profile(
    X: np.ndarray, S: np.ndarray, temperature: float = COVERAGE_TEMPERATURE
) -> np.ndarray:
    """How document ``X``'s statements distribute over the source statements ``S``; a distribution, sums to 1.

    Every statement of ``X`` is softly assigned to the source statements with
    ``softmax(-cost / temperature)``; the profile is the mean of those assignments over
    ``X``. A balanced-OT column marginal is forced uniform (the transport constraint) and so
    carries no per-document signal, whereas this soft nearest-source histogram varies by
    document and captures which source content each one covers.
    """
    costs = cost_matrix(X, S)
    # subtract row-min for numerical stability
    shifted = costs - costs.min(1, keepdims=True)
    weights = np.exp(-shifted / temperature)
    assignment = weights / weights.sum(1, keepdims=True)
    return assignment.mean(0)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def selection_divergence(cov_a: np.ndarray, cov_b: np.ndarray, S: np.ndarray) -> float:
    """D_sel: optimal-transport cost between two coverage profiles over the shared source.

    The ground cost is the Euclidean distance between source-statement embeddings
    (``sqrt(2 - 2cos)`` on normalized rows), so D_sel stays a metric - the guarantee that
    conditioning on a fixed ``S`` buys back. Captures same source, different picks.
    """
    mass_a = np.asarray(cov_a)
    mass_b = np.asarray(cov_b)
    source_costs = cost_matrix(S, S)
    return float(ot.emd2(mass_a, mass_b, source_costs))
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def ungrounded_residual(X: np.ndarray, S: np.ndarray) -> float:
    """Per-document grounding proxy: ``smd(X, S)``, the transport cost from ``X`` to the source.

    A coarse, metric stand-in for the grounding residual: higher means the document sits
    further from the source statements. The reranker + NLI grade that separates
    contradiction from omission is deferred to E02; this is the geometric distance only.
    """
    distance_to_source = smd(X, S)
    return distance_to_source
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@dataclass
class DistanceResult:
    """Outcome of a symmetric comparison; ``smd`` is the distance, ``wcd``/``rwmd`` are its lower bounds."""

    smd: float  # exact Statement Mover's Distance
    wcd: float  # centroid-distance lower bound
    rwmd: float  # relaxed lower bound
    closeness: float  # similarity derived from smd
    threshold: float  # closeness cutoff the verdict was taken at
    verdict: str  # "similar" or "not similar"
    anisotropy: bool  # whether anisotropy removal was applied
    n_statements_a: int  # statement count of the first document
    n_statements_b: int  # statement count of the second document

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, in declaration order."""
        payload = asdict(self)
        return payload
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@dataclass
class SourceConditionedResult:
    """Outcome of a source-conditioned comparison: selection divergence plus per-document distance to the source."""

    d_sel: float  # selection divergence between the two coverage profiles
    residual_a: float  # distance from document A to the source
    residual_b: float  # distance from document B to the source
    closeness_a: float  # closeness derived from residual_a
    closeness_b: float  # closeness derived from residual_b
    n_statements_a: int  # statement count of document A
    n_statements_b: int  # statement count of document B
    n_statements_source: int  # statement count of the source
    coverage_a: list[float]  # A's coverage profile over the source statements
    coverage_b: list[float]  # B's coverage profile over the source statements

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, in declaration order."""
        payload = asdict(self)
        return payload
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def compute_distance(
    emb_a: np.ndarray,
    emb_b: np.ndarray,
    *,
    anisotropy: bool = False,
    threshold: float = DEFAULT_THRESHOLD,
) -> DistanceResult:
    """Build a :class:`DistanceResult` from two statement-embedding arrays.

    Statement counts are taken from the inputs as given. With ``anisotropy`` enabled both
    arrays first pass through :func:`all_but_the_top` (``k=1``), pooled together.

    ``anisotropy`` is off by default: all-but-the-top estimates the shared direction from a
    corpus, so over a single document pair it strips genuine shared meaning and distorts the
    scale. The validated nb04 verdict uses raw embeddings; enable anisotropy only over a
    pooled corpus.
    """
    count_a = len(emb_a)
    count_b = len(emb_b)
    if anisotropy:
        debiased = all_but_the_top({"a": emb_a, "b": emb_b}, k=1)
        emb_a = debiased["a"]
        emb_b = debiased["b"]
    distance = smd(emb_a, emb_b)
    score = closeness(distance)
    centroid_bound = wcd(emb_a, emb_b)
    relaxed_bound = rwmd(emb_a, emb_b)
    label = verdict(score, threshold)
    return DistanceResult(
        smd=distance,
        wcd=centroid_bound,
        rwmd=relaxed_bound,
        closeness=score,
        threshold=threshold,
        verdict=label,
        anisotropy=anisotropy,
        n_statements_a=count_a,
        n_statements_b=count_b,
    )
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def compute_source_conditioned(
    emb_a: np.ndarray,
    emb_b: np.ndarray,
    emb_source: np.ndarray,
    *,
    anisotropy: bool = False,
) -> SourceConditionedResult:
    """Build a :class:`SourceConditionedResult` from the A, B and source statement embeddings.

    Statement counts are taken from the inputs as given. With ``anisotropy`` enabled the
    three arrays first pass through :func:`all_but_the_top` (``k=1``), pooled together.
    """
    count_a = len(emb_a)
    count_b = len(emb_b)
    count_source = len(emb_source)
    if anisotropy:
        debiased = all_but_the_top({"a": emb_a, "b": emb_b, "s": emb_source}, k=1)
        emb_a = debiased["a"]
        emb_b = debiased["b"]
        emb_source = debiased["s"]
    # which source statements each document covers
    profile_a = coverage_profile(emb_a, emb_source)
    profile_b = coverage_profile(emb_b, emb_source)
    # how far each document sits from the source
    drift_a = ungrounded_residual(emb_a, emb_source)
    drift_b = ungrounded_residual(emb_b, emb_source)
    divergence = selection_divergence(profile_a, profile_b, emb_source)
    return SourceConditionedResult(
        d_sel=divergence,
        residual_a=drift_a,
        residual_b=drift_b,
        closeness_a=closeness(drift_a),
        closeness_b=closeness(drift_b),
        n_statements_a=count_a,
        n_statements_b=count_b,
        n_statements_source=count_source,
        coverage_a=profile_a.tolist(),
        coverage_b=profile_b.tolist(),
    )
|
docdistance/encoders.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""Model-backed segmentation and embedding.
|
|
2
|
+
|
|
3
|
+
The heavy dependencies (torch, transformers, openvino, wtpsplit, huggingface_hub) are imported
|
|
4
|
+
lazily inside the functions, so the pure-numpy :mod:`docdistance.distance` core stays
|
|
5
|
+
importable - and unit-testable - without them.
|
|
6
|
+
|
|
7
|
+
Inference never downloads. The constructors set ``HF_HUB_OFFLINE=1``; a model missing from the
|
|
8
|
+
cache raises :class:`ModelsNotInstalled` pointing at ``docdistance install``. Downloading happens
|
|
9
|
+
only in :func:`download_models`, which the ``install`` CLI command calls (TQDM progress bars come
|
|
10
|
+
from huggingface_hub).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import contextlib
|
|
16
|
+
import io
|
|
17
|
+
import os
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
from loguru import logger
|
|
21
|
+
import numpy as np
|
|
22
|
+
|
|
23
|
+
from docdistance import config
|
|
24
|
+
|
|
25
|
+
# keep transformers quiet before it is ever imported - it otherwise prints a model LOAD REPORT
# and advisory warnings (e.g. dropped LM-head keys) that leak past stderr redirection
# setdefault: a value already present in the environment is left untouched
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")

# number of statements tokenized and encoded per batch
EMBED_BATCH = 64
# tokenizer max_length; longer statements are truncated
MAX_TOKENS = 128

# messages carried by ModelsNotInstalled
_INSTALL_HINT = "model not found in cache - run: docdistance install"
_EXTRA_HINT = "model dependencies missing - reinstall: pip install --force-reinstall docdistance"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ModelsNotInstalled(RuntimeError):
    """Raised when a required model or model dependency is unavailable.

    The message tells the user how to recover (``docdistance install`` or a reinstall).
    """
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _require_models_extra() -> None:
    """Check that the model dependencies import, then quiet transformers' logging.

    Raises:
        ModelsNotInstalled: if ``transformers`` or ``wtpsplit`` cannot be imported.
    """
    import logging

    try:
        import transformers
        import wtpsplit  # noqa: F401
    except ModuleNotFoundError as exc:
        raise ModelsNotInstalled(_EXTRA_HINT) from exc

    # belt-and-suspenders alongside the env vars: silence the LOAD REPORT / modeling logger
    transformers.logging.set_verbosity_error()
    modeling_log = logging.getLogger("transformers.modeling_utils")
    modeling_log.setLevel(logging.ERROR)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _set_hf_token() -> None:
    """Prepare Hugging Face related environment variables.

    Copies ``HF_AUTH_TOKEN`` into ``HF_TOKEN`` when the former is set and the latter is
    empty or unset, then fills in defaults for ``HF_HUB_DISABLE_TELEMETRY`` (``"1"``) and
    ``TOKENIZERS_PARALLELISM`` (``"false"``) without overriding existing values.
    """
    env = os.environ
    vault_token = env.get("HF_AUTH_TOKEN")
    if vault_token and not env.get("HF_TOKEN"):
        env["HF_TOKEN"] = vault_token
    for key, fallback in (
        ("HF_HUB_DISABLE_TELEMETRY", "1"),
        ("TOKENIZERS_PARALLELISM", "false"),
    ):
        env.setdefault(key, fallback)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class Segmenter:
    """SAT statement segmenter (wtpsplit ``sat-3l-sm``), CPU."""

    def __init__(self, offline: bool = True):
        """Load the SaT model named by ``config.SAT_MODEL``.

        Args:
            offline: when true, default ``HF_HUB_OFFLINE`` to ``"1"`` before the model
                libraries are imported.

        Raises:
            ModelsNotInstalled: if the model dependencies are missing, or if constructing
                the segmenter fails for any reason (treated as missing weights).
        """
        # Environment first, imports second: huggingface_hub captures HF_HUB_OFFLINE when it
        # is first imported, and _require_models_extra() is what triggers that import, so a
        # flag set afterwards may be ignored.
        _set_hf_token()
        if offline:
            os.environ.setdefault("HF_HUB_OFFLINE", "1")
        _require_models_extra()
        # stderr is swallowed while wtpsplit imports and loads
        with contextlib.redirect_stderr(io.StringIO()):
            from wtpsplit import SaT

            try:
                self._sat = SaT(config.SAT_MODEL)
            except Exception as exc:  # missing weights under offline mode
                raise ModelsNotInstalled(_INSTALL_HINT) from exc
        logger.debug("loaded SAT segmenter '{}'", config.SAT_MODEL)

    def split(self, text: str) -> list[str]:
        """Split ``text`` into statements, stripped of surrounding whitespace, dropping empties."""
        with contextlib.redirect_stderr(io.StringIO()):
            # strip each piece once, then filter out the blank ones
            stripped = [piece.strip() for piece in self._sat.split(text)]
            return [piece for piece in stripped if piece]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class OpenVINOEncoder:
    """mmBERT INT8 OpenVINO encoder (CPU). Mean-pooled, L2-normalized statement embeddings."""

    name = "openvino"

    def __init__(self, offline: bool = True):
        """Load the OpenVINO model and its tokenizer.

        The model is read from ``config.MMBERT_OPENVINO_LOCAL`` when that directory contains
        ``openvino_model.xml``; otherwise it is resolved through
        ``huggingface_hub.snapshot_download`` for ``config.MMBERT_OPENVINO_HF``.

        Args:
            offline: when true and the Hub path is needed, default ``HF_HUB_OFFLINE`` to
                ``"1"`` before the model libraries are imported.

        Raises:
            ModelsNotInstalled: if the model dependencies are missing, or if the snapshot
                cannot be resolved for any reason.
        """
        _set_hf_token()
        src = config.MMBERT_OPENVINO_LOCAL
        needs_hub = not (src / "openvino_model.xml").exists()
        # Decide on offline mode BEFORE importing anything: huggingface_hub captures
        # HF_HUB_OFFLINE when it is first imported, and _require_models_extra() is what
        # triggers that import, so a flag set afterwards may be ignored.
        if needs_hub and offline:
            os.environ.setdefault("HF_HUB_OFFLINE", "1")
        _require_models_extra()
        import openvino as ov
        from transformers import AutoTokenizer

        if needs_hub:
            from huggingface_hub import snapshot_download

            try:
                src = Path(snapshot_download(config.MMBERT_OPENVINO_HF))
            except Exception as exc:
                raise ModelsNotInstalled(_INSTALL_HINT) from exc
        core = ov.Core()
        model = core.read_model(str(src / "openvino_model.xml"))
        # 2nd input name is dropped to '74' (attention_mask) during conversion - feed positionally
        self._innames = [i.get_any_name() for i in model.inputs]
        self._cm = core.compile_model(model, "CPU", {"PERFORMANCE_HINT": "LATENCY"})
        self._tok = AutoTokenizer.from_pretrained(str(src))
        logger.debug("loaded OpenVINO INT8 encoder from {}", src)

    def encode(self, sents: list[str]) -> np.ndarray:
        """Embed ``sents`` in batches of ``EMBED_BATCH`` and return the rows stacked in order.

        Each statement is tokenized (padded, truncated to ``MAX_TOKENS``), run through the
        compiled model, mean-pooled over its unmasked tokens and L2-normalized; rows are
        float32. An empty ``sents`` list produces no batches, so ``np.concatenate`` raises
        ``ValueError``.
        """
        out = []
        for i in range(0, len(sents), EMBED_BATCH):
            batch = sents[i : i + EMBED_BATCH]
            enc = self._tok(
                batch, padding=True, truncation=True, max_length=MAX_TOKENS, return_tensors="np"
            )
            # inputs are addressed by the names recorded at load time: ids first, mask second
            feeds = {self._innames[0]: enc["input_ids"], self._innames[1]: enc["attention_mask"]}
            hidden = self._cm(feeds)[self._cm.output(0)]
            # attention mask as a trailing-axis float weight so padding contributes nothing
            mask = enc["attention_mask"][..., None].astype("float32")
            # masked mean; the clip keeps the token count at 1 or more to avoid dividing by zero
            pooled = (hidden * mask).sum(1) / np.clip(mask.sum(1), 1, None)
            out.append(
                (pooled / (np.linalg.norm(pooled, axis=1, keepdims=True) + 1e-9)).astype(
                    np.float32
                )
            )
        return np.concatenate(out, 0)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class TorchEncoder:
    """mmBERT PyTorch encoder (GPU bf16 if available, else CPU fp32)."""

    name = "torch"

    def __init__(self, offline: bool = True, device: str | None = None):
        """Load the config, tokenizer and model named by ``config.MMBERT_TORCH_MODEL``.

        Args:
            offline: when true, load from the local Hugging Face cache only.
            device: torch device string; defaults to ``"cuda"`` when CUDA is available,
                otherwise ``"cpu"``.

        Raises:
            ModelsNotInstalled: if the config, tokenizer or weights cannot be loaded.
        """
        _require_models_extra()
        _set_hf_token()
        if offline:
            os.environ.setdefault("HF_HUB_OFFLINE", "1")
        import torch
        from transformers import AutoConfig, AutoModel, AutoTokenizer

        self._torch = torch
        self._dev = device or ("cuda" if torch.cuda.is_available() else "cpu")
        model_id = config.MMBERT_TORCH_MODEL
        try:
            # The config load sits inside the try as well, so a missing cache surfaces as
            # ModelsNotInstalled (with the install hint) rather than a raw loader error.
            # local_files_only makes the offline intent explicit; the env var above may not be
            # honoured if huggingface_hub was imported before it was set.
            with contextlib.redirect_stderr(io.StringIO()):
                conf = AutoConfig.from_pretrained(model_id, local_files_only=offline)
            conf.reference_compile = False  # avoid the ModernBERT first-forward torch.compile hang
            self._tok = AutoTokenizer.from_pretrained(model_id, local_files_only=offline)
            enc = AutoModel.from_pretrained(
                model_id, config=conf, attn_implementation="eager", local_files_only=offline
            )
        except Exception as exc:
            raise ModelsNotInstalled(_INSTALL_HINT) from exc
        # Indexed devices such as "cuda:0" are CUDA devices too and get bf16.
        on_cuda = str(self._dev).startswith("cuda")
        dtype = torch.bfloat16 if on_cuda else torch.float32
        self._enc = enc.to(self._dev).to(dtype).eval()
        logger.debug("loaded Torch encoder '{}' on {}", config.MMBERT_TORCH_MODEL, self._dev)

    def encode(self, sents: list[str]) -> np.ndarray:
        """Embed ``sents`` in batches of ``EMBED_BATCH`` and return one float32 row per sentence.

        Each batch is tokenized (padded, truncated to ``MAX_TOKENS``), run through the model
        without gradients, mean-pooled over the attention mask and L2-normalized.

        Raises:
            ValueError: if ``sents`` is empty (``np.concatenate`` receives no arrays).
        """
        torch = self._torch
        out = []
        with torch.no_grad():
            for start in range(0, len(sents), EMBED_BATCH):
                batch = sents[start : start + EMBED_BATCH]
                enc = self._tok(
                    batch,
                    padding=True,
                    truncation=True,
                    max_length=MAX_TOKENS,
                    return_tensors="pt",
                ).to(self._dev)
                # Pool in float32 regardless of the dtype the model runs in.
                hidden = self._enc(**enc).last_hidden_state.float()
                mask = enc["attention_mask"].unsqueeze(-1).float()
                pooled = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1)
                pooled = torch.nn.functional.normalize(pooled, dim=1)
                out.append(pooled.cpu().numpy().astype(np.float32))
        return np.concatenate(out, 0)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def load_encoder(backend: str = "openvino", offline: bool = True):
|
|
183
|
+
"""Factory: return an encoder for ``backend`` in {openvino, torch}."""
|
|
184
|
+
if backend == "openvino":
|
|
185
|
+
return OpenVINOEncoder(offline=offline)
|
|
186
|
+
if backend == "torch":
|
|
187
|
+
return TorchEncoder(offline=offline)
|
|
188
|
+
raise ValueError(f"unknown backend {backend!r}; choose 'openvino' or 'torch'")
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def download_models(backend: str = "openvino") -> list[str]:
    """Download and cache the models for ``backend`` in {openvino, torch, both}.

    The only function that fetches from the Hub. huggingface_hub renders TQDM download bars.

    The SAT segmenter is always fetched. The OpenVINO snapshot is skipped when
    ``config.MMBERT_OPENVINO_LOCAL`` already holds ``openvino_model.xml``.

    Returns:
        The list of backend names that were prepared.

    Raises:
        ValueError: if ``backend`` is not one of ``"openvino"``, ``"torch"`` or ``"both"``.
    """
    # Reject unknown names up front; otherwise the call would download the segmenter and
    # report success for a backend that does not exist.
    if backend not in ("openvino", "torch", "both"):
        raise ValueError(f"unknown backend {backend!r}; choose 'openvino', 'torch' or 'both'")

    _require_models_extra()
    _set_hf_token()
    os.environ.pop("HF_HUB_OFFLINE", None)  # force online for the install step

    logger.info("downloading SAT segmenter '{}'", config.SAT_MODEL)
    with contextlib.redirect_stderr(io.StringIO()):
        from wtpsplit import SaT

    SaT(config.SAT_MODEL)

    backends = ["openvino", "torch"] if backend == "both" else [backend]
    if "openvino" in backends:
        if (config.MMBERT_OPENVINO_LOCAL / "openvino_model.xml").exists():
            logger.info(
                "openvino INT8 encoder already present at {}", config.MMBERT_OPENVINO_LOCAL
            )
        else:
            logger.info("downloading '{}'", config.MMBERT_OPENVINO_HF)
            from huggingface_hub import snapshot_download

            snapshot_download(config.MMBERT_OPENVINO_HF)
    if "torch" in backends:
        logger.info("downloading '{}'", config.MMBERT_TORCH_MODEL)
        from transformers import AutoConfig, AutoModel, AutoTokenizer

        # Instantiating config, tokenizer and model is what populates the cache.
        conf = AutoConfig.from_pretrained(config.MMBERT_TORCH_MODEL)
        conf.reference_compile = False
        AutoTokenizer.from_pretrained(config.MMBERT_TORCH_MODEL)
        AutoModel.from_pretrained(
            config.MMBERT_TORCH_MODEL, config=conf, attn_implementation="eager"
        )

    logger.success("models ready for backend(s): {}", ", ".join(backends))
    return backends
|
docdistance/features.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from loguru import logger
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
import typer
|
|
6
|
+
|
|
7
|
+
from docdistance.config import PROCESSED_DATA_DIR
|
|
8
|
+
|
|
9
|
+
app = typer.Typer()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@app.command()
def main(
    # Template defaults - point these at the real dataset and feature files.
    input_path: Path = PROCESSED_DATA_DIR / "dataset.csv",
    output_path: Path = PROCESSED_DATA_DIR / "features.csv",
):
    # Template placeholder: logs a start message, walks a 10-step progress bar
    # (logging once at step 5), then logs completion. Replace with real feature code.
    logger.info("Generating features from dataset...")
    for step in tqdm(range(10), total=10):
        if step != 5:
            continue
        logger.info("Something happened for iteration 5.")
    logger.success("Features generation complete.")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
if __name__ == "__main__":
|
|
29
|
+
app()
|
|
File without changes
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from loguru import logger
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
import typer
|
|
6
|
+
|
|
7
|
+
from docdistance.config import MODELS_DIR, PROCESSED_DATA_DIR
|
|
8
|
+
|
|
9
|
+
app = typer.Typer()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@app.command()
def main(
    # Template defaults - point these at the real feature, model and prediction files.
    features_path: Path = PROCESSED_DATA_DIR / "test_features.csv",
    model_path: Path = MODELS_DIR / "model.pkl",
    predictions_path: Path = PROCESSED_DATA_DIR / "test_predictions.csv",
):
    # Template placeholder: logs a start message, walks a 10-step progress bar
    # (logging once at step 5), then logs completion. Replace with real inference code.
    logger.info("Performing inference for model...")
    for step in tqdm(range(10), total=10):
        if step != 5:
            continue
        logger.info("Something happened for iteration 5.")
    logger.success("Inference complete.")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
if __name__ == "__main__":
|
|
30
|
+
app()
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from loguru import logger
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
import typer
|
|
6
|
+
|
|
7
|
+
from docdistance.config import MODELS_DIR, PROCESSED_DATA_DIR
|
|
8
|
+
|
|
9
|
+
app = typer.Typer()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@app.command()
def main(
    # Template defaults - point these at the real feature, label and model files.
    features_path: Path = PROCESSED_DATA_DIR / "features.csv",
    labels_path: Path = PROCESSED_DATA_DIR / "labels.csv",
    model_path: Path = MODELS_DIR / "model.pkl",
):
    # Template placeholder: logs a start message, walks a 10-step progress bar
    # (logging once at step 5), then logs completion. Replace with real training code.
    logger.info("Training some model...")
    for step in tqdm(range(10), total=10):
        if step != 5:
            continue
        logger.info("Something happened for iteration 5.")
    logger.success("Modeling training complete.")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
if __name__ == "__main__":
|
|
30
|
+
app()
|
docdistance/pipeline.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""High-level document-distance API.
|
|
2
|
+
|
|
3
|
+
Two entry styles:
|
|
4
|
+
|
|
5
|
+
- :class:`DocDistance` - load the models once, score many pairs (the pipeline-integration entry)
|
|
6
|
+
- :func:`document_distance` / :func:`source_conditioned_distance` - one-shot convenience that loads
|
|
7
|
+
and scores in a single call
|
|
8
|
+
|
|
9
|
+
Inputs are raw text or a path to a text/markdown file (auto-detected). A leading markdown ``# `` title
|
|
10
|
+
line is stripped from files so the document title does not count as a statement.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
|
|
19
|
+
from docdistance import distance as _core
|
|
20
|
+
from docdistance.distance import (
|
|
21
|
+
DEFAULT_THRESHOLD,
|
|
22
|
+
DistanceResult,
|
|
23
|
+
SourceConditionedResult,
|
|
24
|
+
)
|
|
25
|
+
from docdistance.encoders import Segmenter, load_encoder
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _load_body(path: Path) -> str:
|
|
29
|
+
lines = path.read_text().splitlines()
|
|
30
|
+
return "\n".join(ln for ln in lines if not ln.startswith("# ")).strip()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _read(doc: str | Path) -> str:
|
|
34
|
+
"""Resolve a document argument: an existing file path is read, anything else is treated as text."""
|
|
35
|
+
if isinstance(doc, Path):
|
|
36
|
+
return _load_body(doc)
|
|
37
|
+
if isinstance(doc, str):
|
|
38
|
+
try:
|
|
39
|
+
p = Path(doc)
|
|
40
|
+
if p.exists() and p.is_file():
|
|
41
|
+
return _load_body(p)
|
|
42
|
+
except OSError:
|
|
43
|
+
pass
|
|
44
|
+
return doc.strip()
|
|
45
|
+
raise TypeError(f"document must be str or Path, got {type(doc).__name__}")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class DocDistance:
    """Reusable scorer: the segmenter and encoder load once in the constructor, then each
    :meth:`distance` / :meth:`distance_wrt_source` call reuses them."""

    def __init__(self, backend: str = "openvino", offline: bool = True):
        """Build the segmenter and the encoder for ``backend``; ``offline`` goes to both."""
        self.backend = backend
        self.segmenter = Segmenter(offline=offline)
        self.encoder = load_encoder(backend, offline=offline)

    def embed(self, doc: str | Path) -> np.ndarray:
        """Resolve ``doc`` to text, split it into statements and encode them.

        Raises:
            ValueError: if segmentation yields no statements.
        """
        text = _read(doc)
        statements = self.segmenter.split(text)
        if statements:
            return self.encoder.encode(statements)
        raise ValueError("document produced no statements")

    def distance(
        self,
        a: str | Path,
        b: str | Path,
        *,
        anisotropy: bool = False,
        threshold: float = DEFAULT_THRESHOLD,
    ) -> DistanceResult:
        """Embed ``a`` then ``b`` and score the pair with ``compute_distance``."""
        emb_a = self.embed(a)
        emb_b = self.embed(b)
        return _core.compute_distance(emb_a, emb_b, anisotropy=anisotropy, threshold=threshold)

    def distance_wrt_source(
        self,
        a: str | Path,
        b: str | Path,
        source: str | Path,
        *,
        anisotropy: bool = False,
    ) -> SourceConditionedResult:
        """Embed ``a``, ``b`` and ``source`` (in that order) and score them with
        ``compute_source_conditioned``."""
        emb_a = self.embed(a)
        emb_b = self.embed(b)
        emb_source = self.embed(source)
        return _core.compute_source_conditioned(emb_a, emb_b, emb_source, anisotropy=anisotropy)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def document_distance(
    a: str | Path,
    b: str | Path,
    *,
    backend: str = "openvino",
    anisotropy: bool = False,
    threshold: float = DEFAULT_THRESHOLD,
    offline: bool = True,
) -> DistanceResult:
    """One-shot distance between documents ``a`` and ``b``.

    Builds a fresh :class:`DocDistance` (which loads the models) and scores the pair once.
    Use :class:`DocDistance` directly when scoring many pairs.
    """
    pipeline = DocDistance(backend=backend, offline=offline)
    return pipeline.distance(a, b, anisotropy=anisotropy, threshold=threshold)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def source_conditioned_distance(
    a: str | Path,
    b: str | Path,
    source: str | Path,
    *,
    backend: str = "openvino",
    anisotropy: bool = False,
    offline: bool = True,
) -> SourceConditionedResult:
    """One-shot source-conditioned distance d(A, B | S).

    Builds a fresh :class:`DocDistance` (which loads the models) and scores ``a`` and ``b``
    relative to ``source`` once.
    """
    pipeline = DocDistance(backend=backend, offline=offline)
    return pipeline.distance_wrt_source(a, b, source, anisotropy=anisotropy)
|
docdistance/plots.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from loguru import logger
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
import typer
|
|
6
|
+
|
|
7
|
+
from docdistance.config import FIGURES_DIR, PROCESSED_DATA_DIR
|
|
8
|
+
|
|
9
|
+
app = typer.Typer()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@app.command()
def main(
    # Template defaults - point these at the real dataset and the figure to write.
    input_path: Path = PROCESSED_DATA_DIR / "dataset.csv",
    output_path: Path = FIGURES_DIR / "plot.png",
):
    # Template placeholder: logs a start message, walks a 10-step progress bar
    # (logging once at step 5), then logs completion. Replace with real plotting code.
    logger.info("Generating plot from data...")
    for step in tqdm(range(10), total=10):
        if step != 5:
            continue
        logger.info("Something happened for iteration 5.")
    logger.success("Plot generation complete.")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
if __name__ == "__main__":
|
|
29
|
+
app()
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docdistance
|
|
3
|
+
Version: 1.0.15
|
|
4
|
+
Summary: Semantic document distance built on the theory of "From Word Embeddings To Document Distances" and optimal transport. It gives a meaningful distance from one document to another, which is useful when building agentic projects that convert or extract information between documents with frontier models and cannot calculate KL divergence from logits.
|
|
5
|
+
Author: Stellars Henson <konrad.jelen+github@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/stellarshenson/docdistance
|
|
8
|
+
Project-URL: Repository, https://github.com/stellarshenson/docdistance
|
|
9
|
+
Project-URL: Issues, https://github.com/stellarshenson/docdistance/issues
|
|
10
|
+
Keywords: optimal-transport,word-movers-distance,statement-movers-distance,document-similarity,document-distance,embeddings,mmbert,nlp
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Requires-Python: ~=3.13.0
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: loguru
|
|
22
|
+
Requires-Dist: tqdm
|
|
23
|
+
Requires-Dist: typer
|
|
24
|
+
Requires-Dist: rich
|
|
25
|
+
Requires-Dist: python-dotenv
|
|
26
|
+
Requires-Dist: numpy
|
|
27
|
+
Requires-Dist: pot
|
|
28
|
+
Requires-Dist: transformers
|
|
29
|
+
Requires-Dist: wtpsplit
|
|
30
|
+
Requires-Dist: openvino
|
|
31
|
+
Requires-Dist: torch
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: build; extra == "dev"
|
|
34
|
+
Requires-Dist: ipykernel; extra == "dev"
|
|
35
|
+
Requires-Dist: ipython; extra == "dev"
|
|
36
|
+
Requires-Dist: nbdime; extra == "dev"
|
|
37
|
+
Requires-Dist: pip; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
40
|
+
Requires-Dist: ruff; extra == "dev"
|
|
41
|
+
Requires-Dist: twine; extra == "dev"
|
|
42
|
+
Requires-Dist: matplotlib; extra == "dev"
|
|
43
|
+
Requires-Dist: seaborn; extra == "dev"
|
|
44
|
+
Dynamic: license-file
|
|
45
|
+
|
|
46
|
+
# docdistance
|
|
47
|
+
|
|
48
|
+
Semantic distance between two documents via Statement Mover's Distance - optimal transport over mmBERT statement embeddings, after Kusner et al. 2015 (*From Word Embeddings To Document Distances*). A thin frontend to the library; the SOTA docs carry the mechanics, benchmarks, and validation.
|
|
49
|
+
|
|
50
|
+
- **Input** - two documents, raw text or a file path
|
|
51
|
+
- **Output** - an SMD distance, a 0..1 closeness, a verdict, and the statement alignment
|
|
52
|
+
- **Use** - agentic document conversion and extraction pipelines, where token logits are unavailable and KL divergence cannot be computed
|
|
53
|
+
- **Unit** - statement-level and position-invariant, with an interpretable transport plan
|
|
54
|
+
|
|
55
|
+
## Theory
|
|
56
|
+
|
|
57
|
+
A document distance grounded in embeddings and optimal transport, not surface overlap.
|
|
58
|
+
|
|
59
|
+
- **WMD** - Word Mover's Distance (Kusner et al. 2015) casts document similarity as optimal transport between embedded tokens
|
|
60
|
+
- **SMD** - this project lifts it to statements: segment, embed, transport between the two statement clouds
|
|
61
|
+
- **Beyond cosine** - whole-document cosine collapses when the same claims sit in a different place or order; statement-level transport is position-invariant
|
|
62
|
+
- **Metric** - the ground cost `√(2 − 2cos)` on L2-normalized embeddings is a metric, so the document distance is one too
|
|
63
|
+
- **Logit-free** - an embedding-grounded alternative where token probabilities (KL divergence) are unavailable, as in frontier-model pipelines
|
|
64
|
+
|
|
65
|
+
## Method
|
|
66
|
+
|
|
67
|
+
Three stages; the transport plan is the interpretable by-product.
|
|
68
|
+
|
|
69
|
+
1. **Segment** - split each document into atomic statements with the SAT (Segment Any Text) segmenter
|
|
70
|
+
2. **Embed** - encode each statement with the mmBERT contextual encoder (mean-pooled, L2-normalized)
|
|
71
|
+
3. **Compare** - optimal transport between the two statement clouds (Statement Mover's Distance), optionally unbalanced so added or missing statements are scored, not force-matched
|
|
72
|
+
|
|
73
|
+
- **Closeness** - `1 − SMD/√2`, on a 0..1 scale
|
|
74
|
+
- **Source-conditioned** - a variant `d(A, B | S)` re-bases the transport onto a shared source `S` and reads off a selection axis and a grounding axis
|
|
75
|
+
|
|
76
|
+
## Usage
|
|
77
|
+
|
|
78
|
+
The library is the product; install once, then call it.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from docdistance import document_distance
|
|
82
|
+
|
|
83
|
+
result = document_distance("report_v1.md", "report_v2.md")
|
|
84
|
+
print(result.closeness) # 0..1 similarity, 1 - SMD/sqrt(2)
|
|
85
|
+
print(result.verdict) # "similar" | "not similar"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
make install # environment, package, Jupyter kernel
|
|
90
|
+
docdistance install # download + cache the models (once)
|
|
91
|
+
docdistance distance a.md b.md # rich, coloured verdict
|
|
92
|
+
docdistance distance a.md b.md --json # machine-readable JSON
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
- **Offline after install** - distance calls run fully offline once the models are cached
|
|
96
|
+
- **Backend** - `--backend openvino|torch`, default `openvino` (CPU INT8)
|
|
97
|
+
- **Full API and flags** - `docdistance --help` and the SOTA docs
|
|
98
|
+
|
|
99
|
+
## Documentation
|
|
100
|
+
|
|
101
|
+
The SOTA documents explain how it works in detail; this README only introduces it.
|
|
102
|
+
|
|
103
|
+
- `docs/wmd-docdistance-solution-sota.md` - source-free distance: design, mechanism, performance, validation
|
|
104
|
+
- `docs/wmd-wrt-source-docdistance-solution.md` - source-conditioned distance `d(A,B|S)`
|
|
105
|
+
- `docs/mmbert-quantization-solution.md` - the INT8 / FP8 statement encoder
|
|
106
|
+
- `references/papers/from-word-embeddings-to-document-distances.md` - WMD paper digest (Kusner et al. 2015)
|
|
107
|
+
|
|
108
|
+
> **Note**: Scaffolded with the [copier-data-science](https://github.com/stellarshenson/copier-data-science) template.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
docdistance/__init__.py,sha256=7JPs0Q9AwNqWMFm8wcnrTWiP0SNnpAX2p0tA1NjT5gY,852
|
|
2
|
+
docdistance/cli.py,sha256=_bKjQf2gV-RpqJZ7d1RfwoUcZA18hk9bLvmCGcZz4Yg,8222
|
|
3
|
+
docdistance/config.py,sha256=5GZ7NYDn24Kg6ZBczNYlGiuyla_4_sOJ5zhuVx-6ZD4,2106
|
|
4
|
+
docdistance/dataset.py,sha256=huS5XJ_ydmV6rbtokI62GEgwQviUbe1HWHX5Hyutjmw,771
|
|
5
|
+
docdistance/distance.py,sha256=xklp-zY_uGa6EbTV1WYBVfs2XGx1Piy8eugf4eGO1h4,8415
|
|
6
|
+
docdistance/encoders.py,sha256=LND3YWJFRUP7XwN2N1npNrvfao25FzxtbpyFVXjQO_4,9315
|
|
7
|
+
docdistance/features.py,sha256=UU2MNtJ5gjcDm9j2QzQAQxFV6NAwnGKQIslUUreyKLU,774
|
|
8
|
+
docdistance/pipeline.py,sha256=lZisWFpzJqKUWYp80gSz05G8ZHjH1-JvKFd3WdX0SDs,3681
|
|
9
|
+
docdistance/plots.py,sha256=70V_HtIyDSlmoP5hW8Ub3tlm1Vi0X5wSeDZPINxzBJ4,765
|
|
10
|
+
docdistance/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
docdistance/modeling/predict.py,sha256=3B61xaFKG7GRXvwe3xhX5YmJa_M8P3iPWH3uGGcch5w,845
|
|
12
|
+
docdistance/modeling/train.py,sha256=OH6okBYDaUG8tLmEoyBTBHqZmJUvdVp22_hnIdNY0Bo,822
|
|
13
|
+
docdistance-1.0.15.dist-info/licenses/LICENSE,sha256=AtXSSglTQoyugtRKTcRY-XeGVUG83jnSek9Q3iM8rb8,1116
|
|
14
|
+
docdistance-1.0.15.dist-info/METADATA,sha256=uUxUlIpBThv-S6T6p5W3IvrKBNJpmwoAOcp-ZNIH23I,5561
|
|
15
|
+
docdistance-1.0.15.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
16
|
+
docdistance-1.0.15.dist-info/entry_points.txt,sha256=O3HurDhddDvy5pgGPEgac-y-FgDsBHkDPN7IIuJEKJ8,52
|
|
17
|
+
docdistance-1.0.15.dist-info/top_level.txt,sha256=BJP9ozRKdJaw9aUUmkAtLbrb-3QRc4oXgkSaZMGyJG8,12
|
|
18
|
+
docdistance-1.0.15.dist-info/RECORD,,
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
|
|
2
|
+
The MIT License (MIT)
|
|
3
|
+
Copyright (c) 2026, Stellars Henson <konrad.jelen+github@gmail.com>
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
10
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
docdistance
|