docdistance 1.0.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docdistance-1.0.15/LICENSE +10 -0
- docdistance-1.0.15/PKG-INFO +108 -0
- docdistance-1.0.15/README.md +63 -0
- docdistance-1.0.15/pyproject.toml +90 -0
- docdistance-1.0.15/setup.cfg +4 -0
- docdistance-1.0.15/src/docdistance/__init__.py +38 -0
- docdistance-1.0.15/src/docdistance/cli.py +229 -0
- docdistance-1.0.15/src/docdistance/config.py +66 -0
- docdistance-1.0.15/src/docdistance/dataset.py +29 -0
- docdistance-1.0.15/src/docdistance/distance.py +231 -0
- docdistance-1.0.15/src/docdistance/encoders.py +229 -0
- docdistance-1.0.15/src/docdistance/features.py +29 -0
- docdistance-1.0.15/src/docdistance/modeling/__init__.py +0 -0
- docdistance-1.0.15/src/docdistance/modeling/predict.py +30 -0
- docdistance-1.0.15/src/docdistance/modeling/train.py +30 -0
- docdistance-1.0.15/src/docdistance/pipeline.py +115 -0
- docdistance-1.0.15/src/docdistance/plots.py +29 -0
- docdistance-1.0.15/src/docdistance.egg-info/PKG-INFO +108 -0
- docdistance-1.0.15/src/docdistance.egg-info/SOURCES.txt +23 -0
- docdistance-1.0.15/src/docdistance.egg-info/dependency_links.txt +1 -0
- docdistance-1.0.15/src/docdistance.egg-info/entry_points.txt +2 -0
- docdistance-1.0.15/src/docdistance.egg-info/requires.txt +24 -0
- docdistance-1.0.15/src/docdistance.egg-info/top_level.txt +1 -0
- docdistance-1.0.15/tests/test_cli.py +48 -0
- docdistance-1.0.15/tests/test_distance.py +84 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
|
|
2
|
+
The MIT License (MIT)
|
|
3
|
+
Copyright (c) 2026, Stellars Henson <konrad.jelen+github@gmail.com>
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
10
|
+
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docdistance
|
|
3
|
+
Version: 1.0.15
|
|
4
|
+
Summary: Project that uses theory of From Word Embeddings To Document Distances / Optimal Transport to give meaningful distance from one document to another, useful if building agentic projects that convert or extract information from one document to another using frontier models but without the ability to calculate KL divergence from logits
|
|
5
|
+
Author: Stellars Henson <konrad.jelen+github@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/stellarshenson/docdistance
|
|
8
|
+
Project-URL: Repository, https://github.com/stellarshenson/docdistance
|
|
9
|
+
Project-URL: Issues, https://github.com/stellarshenson/docdistance/issues
|
|
10
|
+
Keywords: optimal-transport,word-movers-distance,statement-movers-distance,document-similarity,document-distance,embeddings,mmbert,nlp
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Requires-Python: ~=3.13.0
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: loguru
|
|
22
|
+
Requires-Dist: tqdm
|
|
23
|
+
Requires-Dist: typer
|
|
24
|
+
Requires-Dist: rich
|
|
25
|
+
Requires-Dist: python-dotenv
|
|
26
|
+
Requires-Dist: numpy
|
|
27
|
+
Requires-Dist: pot
|
|
28
|
+
Requires-Dist: transformers
|
|
29
|
+
Requires-Dist: wtpsplit
|
|
30
|
+
Requires-Dist: openvino
|
|
31
|
+
Requires-Dist: torch
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: build; extra == "dev"
|
|
34
|
+
Requires-Dist: ipykernel; extra == "dev"
|
|
35
|
+
Requires-Dist: ipython; extra == "dev"
|
|
36
|
+
Requires-Dist: nbdime; extra == "dev"
|
|
37
|
+
Requires-Dist: pip; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
40
|
+
Requires-Dist: ruff; extra == "dev"
|
|
41
|
+
Requires-Dist: twine; extra == "dev"
|
|
42
|
+
Requires-Dist: matplotlib; extra == "dev"
|
|
43
|
+
Requires-Dist: seaborn; extra == "dev"
|
|
44
|
+
Dynamic: license-file
|
|
45
|
+
|
|
46
|
+
# docdistance
|
|
47
|
+
|
|
48
|
+
Semantic distance between two documents via Statement Mover's Distance - optimal transport over mmBERT statement embeddings, after Kusner et al. 2015 (*From Word Embeddings To Document Distances*). A thin frontend to the library; the SOTA docs carry the mechanics, benchmarks, and validation.
|
|
49
|
+
|
|
50
|
+
- **Input** - two documents, raw text or a file path
|
|
51
|
+
- **Output** - an SMD distance, a 0..1 closeness, a verdict, and the statement alignment
|
|
52
|
+
- **Use** - agentic document conversion and extraction pipelines, where token logits are unavailable and KL divergence cannot be computed
|
|
53
|
+
- **Unit** - statement-level and position-invariant, with an interpretable transport plan
|
|
54
|
+
|
|
55
|
+
## Theory
|
|
56
|
+
|
|
57
|
+
A document distance grounded in embeddings and optimal transport, not surface overlap.
|
|
58
|
+
|
|
59
|
+
- **WMD** - Word Mover's Distance (Kusner et al. 2015) casts document similarity as optimal transport between embedded tokens
|
|
60
|
+
- **SMD** - this project lifts it to statements: segment, embed, transport between the two statement clouds
|
|
61
|
+
- **Beyond cosine** - whole-document cosine collapses when the same claims sit in a different place or order; statement-level transport is position-invariant
|
|
62
|
+
- **Metric** - the ground cost `√(2 − 2cos)` on L2-normalized embeddings is a metric, so the document distance is one too
|
|
63
|
+
- **Logit-free** - an embedding-grounded alternative where token probabilities (KL divergence) are unavailable, as in frontier-model pipelines
|
|
64
|
+
|
|
65
|
+
## Method
|
|
66
|
+
|
|
67
|
+
Three stages; the transport plan is the interpretable by-product.
|
|
68
|
+
|
|
69
|
+
1. **Segment** - split each document into atomic statements with the SAT (Segment Any Text) segmenter
|
|
70
|
+
2. **Embed** - encode each statement with the mmBERT contextual encoder (mean-pooled, L2-normalized)
|
|
71
|
+
3. **Compare** - optimal transport between the two statement clouds (Statement Mover's Distance), optionally unbalanced so added or missing statements are scored, not force-matched
|
|
72
|
+
|
|
73
|
+
- **Closeness** - `1 − SMD/√2`, on a 0..1 scale
|
|
74
|
+
- **Source-conditioned** - a variant `d(A, B | S)` re-bases the transport onto a shared source `S` and reads off a selection axis and a grounding axis
|
|
75
|
+
|
|
76
|
+
## Usage
|
|
77
|
+
|
|
78
|
+
The library is the product; install once, then call it.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from docdistance import document_distance
|
|
82
|
+
|
|
83
|
+
result = document_distance("report_v1.md", "report_v2.md")
|
|
84
|
+
print(result.closeness) # 0..1 similarity, 1 - SMD/sqrt(2)
|
|
85
|
+
print(result.verdict) # "similar" | "not similar"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
make install # environment, package, Jupyter kernel
|
|
90
|
+
docdistance install # download + cache the models (once)
|
|
91
|
+
docdistance distance a.md b.md # rich, coloured verdict
|
|
92
|
+
docdistance distance a.md b.md --json # machine-readable JSON
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
- **Offline after install** - distance calls run fully offline once the models are cached
|
|
96
|
+
- **Backend** - `--backend openvino|torch`, default `openvino` (CPU INT8)
|
|
97
|
+
- **Full API and flags** - `docdistance --help` and the SOTA docs
|
|
98
|
+
|
|
99
|
+
## Documentation
|
|
100
|
+
|
|
101
|
+
The SOTA documents explain how it works in detail; this README only introduces it.
|
|
102
|
+
|
|
103
|
+
- `docs/wmd-docdistance-solution-sota.md` - source-free distance: design, mechanism, performance, validation
|
|
104
|
+
- `docs/wmd-wrt-source-docdistance-solution.md` - source-conditioned distance `d(A,B|S)`
|
|
105
|
+
- `docs/mmbert-quantization-solution.md` - the INT8 / FP8 statement encoder
|
|
106
|
+
- `references/papers/from-word-embeddings-to-document-distances.md` - WMD paper digest (Kusner et al. 2015)
|
|
107
|
+
|
|
108
|
+
> **Note**: Scaffolded with the [copier-data-science](https://github.com/stellarshenson/copier-data-science) template.
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# docdistance
|
|
2
|
+
|
|
3
|
+
Semantic distance between two documents via Statement Mover's Distance - optimal transport over mmBERT statement embeddings, after Kusner et al. 2015 (*From Word Embeddings To Document Distances*). This README is only a thin front door to the library; the SOTA docs listed under Documentation carry the mechanics, benchmarks, and validation.
|
|
4
|
+
|
|
5
|
+
- **Input** - two documents, raw text or a file path
|
|
6
|
+
- **Output** - an SMD distance, a 0..1 closeness, a verdict, and the statement alignment
|
|
7
|
+
- **Use** - agentic document conversion and extraction pipelines, where token logits are unavailable and KL divergence cannot be computed
|
|
8
|
+
- **Unit** - statement-level and position-invariant, with an interpretable transport plan
|
|
9
|
+
|
|
10
|
+
## Theory
|
|
11
|
+
|
|
12
|
+
A document distance grounded in embeddings and optimal transport, not surface overlap.
|
|
13
|
+
|
|
14
|
+
- **WMD** - Word Mover's Distance (Kusner et al. 2015) casts document similarity as optimal transport between embedded tokens
|
|
15
|
+
- **SMD** - this project lifts it to statements: segment, embed, transport between the two statement clouds
|
|
16
|
+
- **Beyond cosine** - whole-document cosine collapses when the same claims sit in a different place or order; statement-level transport is position-invariant
|
|
17
|
+
- **Metric** - the ground cost `√(2 − 2cos)` on L2-normalized embeddings is a metric, so the document distance is one too
|
|
18
|
+
- **Logit-free** - an embedding-grounded alternative where token probabilities (KL divergence) are unavailable, as in frontier-model pipelines
|
|
19
|
+
|
|
20
|
+
## Method
|
|
21
|
+
|
|
22
|
+
Three stages; the transport plan is the interpretable by-product.
|
|
23
|
+
|
|
24
|
+
1. **Segment** - split each document into atomic statements with the SAT (Segment Any Text) segmenter
|
|
25
|
+
2. **Embed** - encode each statement with the mmBERT contextual encoder (mean-pooled, L2-normalized)
|
|
26
|
+
3. **Compare** - optimal transport between the two statement clouds (Statement Mover's Distance), optionally unbalanced so added or missing statements are scored, not force-matched
|
|
27
|
+
|
|
28
|
+
- **Closeness** - `1 − SMD/√2`, on a 0..1 scale
|
|
29
|
+
- **Source-conditioned** - a variant `d(A, B | S)` re-bases the transport onto a shared source `S` and reads off a selection axis and a grounding axis
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
The library is the product; install once, then call it.
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from docdistance import document_distance
|
|
37
|
+
|
|
38
|
+
result = document_distance("report_v1.md", "report_v2.md")
|
|
39
|
+
print(result.closeness) # 0..1 similarity, 1 - SMD/sqrt(2)
|
|
40
|
+
print(result.verdict) # "similar" | "not similar"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
make install # environment, package, Jupyter kernel
|
|
45
|
+
docdistance install # download + cache the models (once)
|
|
46
|
+
docdistance distance a.md b.md # rich, coloured verdict
|
|
47
|
+
docdistance distance a.md b.md --json # machine-readable JSON
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
- **Offline after install** - distance calls run fully offline once the models are cached
|
|
51
|
+
- **Backend** - `--backend openvino|torch`, default `openvino` (CPU INT8)
|
|
52
|
+
- **Full API and flags** - `docdistance --help` and the SOTA docs
|
|
53
|
+
|
|
54
|
+
## Documentation
|
|
55
|
+
|
|
56
|
+
The SOTA documents explain how it works in detail; this README only introduces it.
|
|
57
|
+
|
|
58
|
+
- `docs/wmd-docdistance-solution-sota.md` - source-free distance: design, mechanism, performance, validation
|
|
59
|
+
- `docs/wmd-wrt-source-docdistance-solution.md` - source-conditioned distance `d(A,B|S)`
|
|
60
|
+
- `docs/mmbert-quantization-solution.md` - the INT8 / FP8 statement encoder
|
|
61
|
+
- `references/papers/from-word-embeddings-to-document-distances.md` - WMD paper digest (Kusner et al. 2015)
|
|
62
|
+
|
|
63
|
+
> **Note**: Scaffolded with the [copier-data-science](https://github.com/stellarshenson/copier-data-science) template.
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "docdistance"
|
|
7
|
+
version = "1.0.15"
|
|
8
|
+
description = "Project that uses theory of From Word Embeddings To Document Distances / Optimal Transport to give meaningful distance from one document to another, useful if building agentic projects that convert or extract information from one document to another using frontier models but without the ability to calculate KL divergence from logits"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "Stellars Henson <konrad.jelen+github@gmail.com>" },
|
|
11
|
+
]
|
|
12
|
+
license = "MIT"
|
|
13
|
+
readme = "README.md"
|
|
14
|
+
requires-python = "~=3.13.0"
|
|
15
|
+
keywords = [
|
|
16
|
+
"optimal-transport",
|
|
17
|
+
"word-movers-distance",
|
|
18
|
+
"statement-movers-distance",
|
|
19
|
+
"document-similarity",
|
|
20
|
+
"document-distance",
|
|
21
|
+
"embeddings",
|
|
22
|
+
"mmbert",
|
|
23
|
+
"nlp",
|
|
24
|
+
]
|
|
25
|
+
classifiers = [
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.13",
|
|
28
|
+
"Operating System :: OS Independent",
|
|
29
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
30
|
+
"Topic :: Text Processing :: Linguistic",
|
|
31
|
+
"Intended Audience :: Developers",
|
|
32
|
+
"Intended Audience :: Science/Research",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
dependencies = [
|
|
36
|
+
"loguru",
|
|
37
|
+
"tqdm",
|
|
38
|
+
"typer",
|
|
39
|
+
"rich",
|
|
40
|
+
"python-dotenv",
|
|
41
|
+
"numpy",
|
|
42
|
+
"pot",
|
|
43
|
+
"transformers",
|
|
44
|
+
"wtpsplit",
|
|
45
|
+
"openvino",
|
|
46
|
+
"torch",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[project.urls]
|
|
50
|
+
Homepage = "https://github.com/stellarshenson/docdistance"
|
|
51
|
+
Repository = "https://github.com/stellarshenson/docdistance"
|
|
52
|
+
Issues = "https://github.com/stellarshenson/docdistance/issues"
|
|
53
|
+
|
|
54
|
+
[project.scripts]
|
|
55
|
+
docdistance = "docdistance.cli:app"
|
|
56
|
+
|
|
57
|
+
[project.optional-dependencies]
|
|
58
|
+
dev = [
|
|
59
|
+
"build",
|
|
60
|
+
"ipykernel",
|
|
61
|
+
"ipython",
|
|
62
|
+
"nbdime",
|
|
63
|
+
"pip",
|
|
64
|
+
"pytest",
|
|
65
|
+
"pytest-cov",
|
|
66
|
+
"ruff",
|
|
67
|
+
"twine",
|
|
68
|
+
"matplotlib",
|
|
69
|
+
"seaborn",
|
|
70
|
+
]
|
|
71
|
+
|
|
72
|
+
[tool.setuptools]
|
|
73
|
+
include-package-data = true
|
|
74
|
+
|
|
75
|
+
[tool.setuptools.packages.find]
|
|
76
|
+
where = ["src"]
|
|
77
|
+
include = ["docdistance*"]
|
|
78
|
+
exclude = ["tests*"]
|
|
79
|
+
|
|
80
|
+
[tool.ruff]
|
|
81
|
+
line-length = 99
|
|
82
|
+
src = ["src/docdistance"]
|
|
83
|
+
include = ["pyproject.toml", "src/docdistance/**/*.py"]
|
|
84
|
+
|
|
85
|
+
[tool.ruff.lint]
|
|
86
|
+
extend-select = ["I"]
|
|
87
|
+
|
|
88
|
+
[tool.ruff.lint.isort]
|
|
89
|
+
known-first-party = ["docdistance"]
|
|
90
|
+
force-sort-within-sections = true
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
2
|
+
|
|
3
|
+
from docdistance import config # noqa: F401 (sets up logging + paths on import)
|
|
4
|
+
from docdistance.distance import (
|
|
5
|
+
DistanceResult,
|
|
6
|
+
SourceConditionedResult,
|
|
7
|
+
closeness,
|
|
8
|
+
compute_distance,
|
|
9
|
+
compute_source_conditioned,
|
|
10
|
+
rwmd,
|
|
11
|
+
smd,
|
|
12
|
+
wcd,
|
|
13
|
+
)
|
|
14
|
+
from docdistance.pipeline import (
|
|
15
|
+
DocDistance,
|
|
16
|
+
document_distance,
|
|
17
|
+
source_conditioned_distance,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
try:
|
|
21
|
+
__version__ = version("docdistance")
|
|
22
|
+
except PackageNotFoundError: # running from source, not installed
|
|
23
|
+
__version__ = "0.0.0"
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"DocDistance",
|
|
27
|
+
"DistanceResult",
|
|
28
|
+
"SourceConditionedResult",
|
|
29
|
+
"document_distance",
|
|
30
|
+
"source_conditioned_distance",
|
|
31
|
+
"compute_distance",
|
|
32
|
+
"compute_source_conditioned",
|
|
33
|
+
"smd",
|
|
34
|
+
"wcd",
|
|
35
|
+
"rwmd",
|
|
36
|
+
"closeness",
|
|
37
|
+
"__version__",
|
|
38
|
+
]
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""docdistance command-line interface.
|
|
2
|
+
|
|
3
|
+
Three subcommands - ``install`` (the only one that downloads models), ``distance`` (symmetric SMD)
|
|
4
|
+
and ``distance-wrt-source`` (source-conditioned). Human output is rich and coloured on a capable
|
|
5
|
+
terminal; ``--json`` emits machine-readable JSON and ``--result-only`` emits the bare result.
|
|
6
|
+
Logs go to stderr (loguru, ``--verbose`` for DEBUG), so stdout carries only the result.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from enum import Enum
|
|
12
|
+
import json
|
|
13
|
+
|
|
14
|
+
from rich.console import Console
|
|
15
|
+
from rich.panel import Panel
|
|
16
|
+
from rich.table import Table
|
|
17
|
+
import typer
|
|
18
|
+
|
|
19
|
+
from docdistance.config import configure_logging
|
|
20
|
+
from docdistance.distance import DEFAULT_THRESHOLD
|
|
21
|
+
|
|
22
|
+
# Root Typer application; the subcommands below register themselves on it.
app = typer.Typer(
    help=(
        "[bold]docdistance[/bold] - semantic distance between documents via Statement Mover's Distance "
        "(optimal transport over mmBERT statement embeddings)."
    ),
    add_completion=False,
    no_args_is_help=True,
    rich_markup_mode="rich",
)

# Separate consoles so the result and the diagnostics never share a stream.
_err = Console(stderr=True)  # stderr, for errors
_out = Console()  # stdout, for the result
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class Backend(str, Enum):
    """Statement-encoder backends accepted by the ``--backend`` option of the distance commands."""

    # Member order is the order in which the choices are enumerated.
    openvino = "openvino"
    torch = "torch"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class InstallBackend(str, Enum):
    """Choices for the ``--backend`` option of the ``install`` command."""

    # The two concrete backends, plus ``both`` (the default of ``install``).
    openvino = "openvino"
    torch = "torch"
    both = "both"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _version_cb(value: bool):
    """Option callback for ``--version``: print the package version and exit when the flag is set."""
    if not value:
        return

    # Imported here rather than at module level, only when the flag is actually used.
    from docdistance import __version__

    typer.echo(f"docdistance {__version__}")
    raise typer.Exit()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@app.callback()
def main(
    # Eager option: its callback runs before any subcommand is resolved.
    version: bool = typer.Option(
        False,
        "--version",
        help="show version and exit",
        is_eager=True,
        callback=_version_cb,
    ),
):
    """Semantic document distance grounded in optimal-transport theory."""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _run(fn):
    """Call ``fn`` and turn a missing-model error into a clean message + exit code 1.

    Args:
        fn: zero-argument callable that performs the actual work.

    Returns:
        Whatever ``fn`` returns.

    Raises:
        typer.Exit: with exit code 1 when ``fn`` raises ``ModelsNotInstalled``.
    """
    from rich.markup import escape

    from docdistance.encoders import ModelsNotInstalled

    try:
        return fn()
    except ModelsNotInstalled as exc:
        # The message text is not controlled here; escape it so any square brackets are
        # printed literally instead of being parsed as rich markup tags.
        # NOTE(review): assumes the exception message is plain text, not rich markup - confirm.
        _err.print(f"[bold red]error:[/bold red] {escape(str(exc))}")
        # Chain explicitly so the original error stays attached as the cause.
        raise typer.Exit(1) from exc
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _emit_distance(r, json_out: bool, result_only: bool) -> None:
    """Write a symmetric-distance result to stdout in the requested format.

    ``result_only`` takes precedence over ``json_out``; with neither set, a rich panel is printed.
    """
    if result_only:
        # Bare scalar, nothing else on stdout.
        typer.echo(str(r.smd))
    elif json_out:
        typer.echo(json.dumps(r.to_dict(), indent=2))
    else:
        # The verdict drives both the verdict text colour and the panel border.
        accent = "green" if r.verdict == "similar" else "red"
        rows = (
            ("SMD (distance)", f"{r.smd:.4f}"),
            ("closeness", f"{r.closeness * 100:.1f}%"),
            ("verdict", f"[{accent}]{r.verdict}[/{accent}] (threshold {r.threshold:.2f} closeness)"),
            ("bounds", f"WCD {r.wcd:.4f} ≤ RWMD {r.rwmd:.4f} ≤ SMD {r.smd:.4f}"),
            ("statements", f"{r.n_statements_a} vs {r.n_statements_b}"),
            ("anisotropy", "on" if r.anisotropy else "off"),
        )

        table = Table.grid(padding=(0, 2))
        table.add_column(style="bold cyan")  # labels
        table.add_column()  # values
        for label, text in rows:
            table.add_row(label, text)

        panel = Panel(
            table, title="[bold]Document distance[/bold]", border_style=accent, expand=False
        )
        _out.print(panel)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _emit_wrt_source(r, json_out: bool, result_only: bool) -> None:
    """Write a source-conditioned result to stdout in the requested format.

    ``result_only`` takes precedence over ``json_out``; with neither set, a rich panel
    followed by a dimmed footnote is printed.
    """
    if result_only:
        # Comma-separated triple: D_sel, residual of A, residual of B.
        typer.echo(f"{r.d_sel},{r.residual_a},{r.residual_b}")
    elif json_out:
        typer.echo(json.dumps(r.to_dict(), indent=2))
    else:
        rows = (
            ("D_sel (selection divergence)", f"{r.d_sel:.4f}"),
            ("A → source", f"{r.residual_a:.4f} (closeness {r.closeness_a * 100:.1f}%)"),
            ("B → source", f"{r.residual_b:.4f} (closeness {r.closeness_b * 100:.1f}%)"),
            (
                "statements",
                f"A {r.n_statements_a} / B {r.n_statements_b} / S {r.n_statements_source}",
            ),
        )

        table = Table.grid(padding=(0, 2))
        table.add_column(style="bold cyan")  # labels
        table.add_column()  # values
        for label, text in rows:
            table.add_row(label, text)

        panel = Panel(
            table,
            title="[bold]Source-conditioned distance d(A,B|S)[/bold]",
            border_style="cyan",
            expand=False,
        )
        _out.print(panel)

        footnote = (
            "[dim]residual = geometric distance to the source; the reranker + NLI grounding grade and "
            "numeric verifier are deferred to E02[/dim]"
        )
        _out.print(footnote)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@app.command(
    epilog="[bold]Examples[/bold]\n\n"
    " docdistance distance report_v1.md report_v2.md\n"
    ' docdistance distance "first text" "second text" --backend torch\n'
    " docdistance distance a.md b.md --json\n"
    " docdistance distance a.md b.md --result-only"
)
def distance(
    a: str = typer.Argument(..., help="first document - a file path or raw text"),
    b: str = typer.Argument(..., help="second document - a file path or raw text"),
    backend: Backend = typer.Option(
        Backend.openvino,
        "--backend",
        help="statement encoder backend",
    ),
    anisotropy: bool = typer.Option(
        False,
        "--anisotropy/--no-anisotropy",
        help="all-but-the-top anisotropy removal - needs a corpus, off by default for a pair",
    ),
    threshold: float = typer.Option(
        DEFAULT_THRESHOLD,
        "--threshold",
        help="closeness cutoff for the similar / not-similar verdict",
    ),
    json_out: bool = typer.Option(False, "--json", help="machine-readable JSON to stdout"),
    result_only: bool = typer.Option(
        False,
        "--result-only",
        help="bare SMD scalar to stdout, no clutter",
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="DEBUG logging to stderr"),
):
    """Symmetric distance between two documents - the exact Statement Mover's Distance."""
    # Logging is configured before the pipeline module is imported.
    configure_logging(verbose)
    from docdistance.pipeline import document_distance

    def _compute():
        # Deferred so that _run can translate a missing-model error into exit code 1.
        return document_distance(
            a, b, backend=backend.value, anisotropy=anisotropy, threshold=threshold
        )

    outcome = _run(_compute)
    _emit_distance(outcome, json_out, result_only)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@app.command(
    name="distance-wrt-source",
    epilog="[bold]Examples[/bold]\n\n"
    " docdistance distance-wrt-source summary_a.md summary_b.md --source article.md\n"
    " docdistance distance-wrt-source a.md b.md -s s.md --json\n"
    " docdistance distance-wrt-source a.md b.md -s s.md --result-only [dim]# D_sel,res_a,res_b[/dim]",
)
def distance_wrt_source(
    a: str = typer.Argument(..., help="first document - a file path or raw text"),
    b: str = typer.Argument(..., help="second document - a file path or raw text"),
    source: str = typer.Option(..., "--source", "-s", help="the common source document"),
    backend: Backend = typer.Option(
        Backend.openvino,
        "--backend",
        help="statement encoder backend",
    ),
    anisotropy: bool = typer.Option(
        False,
        "--anisotropy/--no-anisotropy",
        help="anisotropy removal - needs a corpus, off by default",
    ),
    json_out: bool = typer.Option(False, "--json", help="machine-readable JSON to stdout"),
    result_only: bool = typer.Option(
        False,
        "--result-only",
        help="bare comma-separated D_sel,residual_a,residual_b to stdout",
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="DEBUG logging to stderr"),
):
    """Source-conditioned distance d(A, B | S) - selection divergence plus each document's distance to S."""
    # Logging is configured before the pipeline module is imported.
    configure_logging(verbose)
    from docdistance.pipeline import source_conditioned_distance

    def _compute():
        # Deferred so that _run can translate a missing-model error into exit code 1.
        return source_conditioned_distance(
            a, b, source, backend=backend.value, anisotropy=anisotropy
        )

    outcome = _run(_compute)
    _emit_wrt_source(outcome, json_out, result_only)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
@app.command(
    epilog="[bold]Examples[/bold]\n\n"
    " docdistance install [dim]# both backends[/dim]\n"
    " docdistance install --backend openvino",
)
def install(
    backend: InstallBackend = typer.Option(
        InstallBackend.both, "--backend", help="which encoder weights to fetch"
    ),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="DEBUG logging to stderr"),
):
    """Download and cache the models - the only command that fetches from the Hub (TQDM progress bars)."""
    # NOTE: the docstring above is the command's help text; keep it user-facing.
    configure_logging(verbose)
    from docdistance.encoders import download_models

    # Reuse the shared helper instead of a second copy of the try/except: a
    # ModelsNotInstalled error becomes a message on stderr plus exit code 1,
    # exactly as for the distance commands.
    backends = _run(lambda: download_models(backend.value))

    # download_models returns the backend names that were made ready.
    _out.print(f"[green]models ready:[/green] {', '.join(backends)}")
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
if __name__ == "__main__":
|
|
229
|
+
app()
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
from dotenv import load_dotenv
|
|
5
|
+
from loguru import logger
|
|
6
|
+
|
|
7
|
+
########### SETUP ###############

# Logger: INFO by default (DEBUG only via the CLI --verbose flag), always on stderr so
# stdout stays clean for --json / --result-only output.
logger.remove()

# If tqdm is installed, configure loguru with tqdm.write; otherwise log to stderr directly.
# https://github.com/Delgan/loguru/issues/135
try:
    from tqdm import tqdm
except ModuleNotFoundError:
    logger.add(sys.stderr, colorize=True, level="INFO")
else:
    logger.add(lambda msg: tqdm.write(msg, end="", file=sys.stderr), colorize=True, level="INFO")

########## VARIABLES ############

# Pick up environment variables from a .env file when one exists.
load_dotenv()

# Paths, all derived from the directory two levels above this file's directory.
# NOTE(review): that is the repository root for a src/ layout checkout; verify what it
# resolves to when the package is installed from a wheel.
PROJ_ROOT = Path(__file__).resolve().parents[2]

DATA_DIR = PROJ_ROOT / "data"
RAW_DATA_DIR = DATA_DIR / "raw"
INTERIM_DATA_DIR = DATA_DIR / "interim"
PROCESSED_DATA_DIR = DATA_DIR / "processed"
EXTERNAL_DATA_DIR = DATA_DIR / "external"

MODELS_DIR = PROJ_ROOT / "models"

REPORTS_DIR = PROJ_ROOT / "reports"
FIGURES_DIR = REPORTS_DIR / "figures"

# Logged at DEBUG so it never shows up at the default INFO level.
logger.debug(f"PROJ_ROOT path is: {PROJ_ROOT}")

########## MODELS ###############

# Segmenter (wtpsplit SaT) and the mmBERT statement encoders, by backend.
SAT_MODEL = "sat-3l-sm"
MMBERT_TORCH_MODEL = "jhu-clsp/mmBERT-base"
MMBERT_OPENVINO_LOCAL = MODELS_DIR / "02-mmbert-openvino-int8"
MMBERT_OPENVINO_HF = "stellars/mmBERT-base-openvino-int8"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def configure_logging(verbose: bool = False) -> None:
    """Reset loguru to a single stderr sink at INFO, or at DEBUG when ``verbose`` is true.

    The CLI commands call this first. Logging to stderr leaves stdout to the result, so
    ``--json`` and ``--result-only`` output stays machine-parseable.
    """
    # Choose the sink: through tqdm.write when tqdm can be imported, plain stderr otherwise.
    try:
        from tqdm import tqdm
    except ModuleNotFoundError:
        sink = sys.stderr
    else:

        def sink(msg):
            tqdm.write(msg, end="", file=sys.stderr)

    # Drop every existing handler, then install the one chosen above.
    logger.remove()
    logger.add(sink, colorize=True, level="DEBUG" if verbose else "INFO")
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from loguru import logger
|
|
4
|
+
from tqdm import tqdm
|
|
5
|
+
import typer
|
|
6
|
+
|
|
7
|
+
from docdistance.config import PROCESSED_DATA_DIR, RAW_DATA_DIR
|
|
8
|
+
|
|
9
|
+
app = typer.Typer()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@app.command()
def main(
    # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ----
    input_path: Path = RAW_DATA_DIR / "dataset.csv",
    output_path: Path = PROCESSED_DATA_DIR / "dataset.csv",
    # ----------------------------------------------
):
    # Template placeholder: neither path is read or written yet; the loop below only
    # demonstrates logging alongside a progress bar.
    # ---- REPLACE THIS WITH YOUR OWN CODE ----
    logger.info("Processing dataset...")
    for step in tqdm(range(10), total=10):
        if step != 5:
            continue
        logger.info("Something happened for iteration 5.")
    logger.success("Processing dataset complete.")
    # -----------------------------------------
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
if __name__ == "__main__":
|
|
29
|
+
app()
|