docdistance 1.0.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+
2
+ The MIT License (MIT)
3
+ Copyright (c) 2026, Stellars Henson <konrad.jelen+github@gmail.com>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10
+
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: docdistance
3
+ Version: 1.0.15
4
+ Summary: Project that uses the theory of From Word Embeddings To Document Distances / Optimal Transport to give a meaningful distance from one document to another; useful when building agentic projects that convert or extract information from one document to another using frontier models, without the ability to calculate KL divergence from logits
5
+ Author: Stellars Henson <konrad.jelen+github@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/stellarshenson/docdistance
8
+ Project-URL: Repository, https://github.com/stellarshenson/docdistance
9
+ Project-URL: Issues, https://github.com/stellarshenson/docdistance/issues
10
+ Keywords: optimal-transport,word-movers-distance,statement-movers-distance,document-similarity,document-distance,embeddings,mmbert,nlp
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Classifier: Topic :: Text Processing :: Linguistic
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Science/Research
18
+ Requires-Python: ~=3.13.0
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: loguru
22
+ Requires-Dist: tqdm
23
+ Requires-Dist: typer
24
+ Requires-Dist: rich
25
+ Requires-Dist: python-dotenv
26
+ Requires-Dist: numpy
27
+ Requires-Dist: pot
28
+ Requires-Dist: transformers
29
+ Requires-Dist: wtpsplit
30
+ Requires-Dist: openvino
31
+ Requires-Dist: torch
32
+ Provides-Extra: dev
33
+ Requires-Dist: build; extra == "dev"
34
+ Requires-Dist: ipykernel; extra == "dev"
35
+ Requires-Dist: ipython; extra == "dev"
36
+ Requires-Dist: nbdime; extra == "dev"
37
+ Requires-Dist: pip; extra == "dev"
38
+ Requires-Dist: pytest; extra == "dev"
39
+ Requires-Dist: pytest-cov; extra == "dev"
40
+ Requires-Dist: ruff; extra == "dev"
41
+ Requires-Dist: twine; extra == "dev"
42
+ Requires-Dist: matplotlib; extra == "dev"
43
+ Requires-Dist: seaborn; extra == "dev"
44
+ Dynamic: license-file
45
+
46
+ # docdistance
47
+
48
+ Semantic distance between two documents via Statement Mover's Distance - optimal transport over mmBERT statement embeddings, after Kusner et al. 2015 (*From Word Embeddings To Document Distances*). This README is a thin front end to the library; the SOTA docs (listed under Documentation) carry the mechanics, benchmarks, and validation.
49
+
50
+ - **Input** - two documents, raw text or a file path
51
+ - **Output** - an SMD distance, a 0..1 closeness, a verdict, and the statement alignment
52
+ - **Use** - agentic document conversion and extraction pipelines, where token logits are unavailable and KL divergence cannot be computed
53
+ - **Unit** - statement-level and position-invariant, with an interpretable transport plan
54
+
55
+ ## Theory
56
+
57
+ A document distance grounded in embeddings and optimal transport, not surface overlap.
58
+
59
+ - **WMD** - Word Mover's Distance (Kusner et al. 2015) casts document similarity as optimal transport between embedded tokens
60
+ - **SMD** - this project lifts it to statements: segment, embed, transport between the two statement clouds
61
+ - **Beyond cosine** - whole-document cosine collapses when the same claims sit in a different place or order; statement-level transport is position-invariant
62
+ - **Metric** - the ground cost `√(2 − 2cos)` on L2-normalized embeddings is a metric, so the document distance is one too
63
+ - **Logit-free** - an embedding-grounded alternative where token probabilities (KL divergence) are unavailable, as in frontier-model pipelines
64
+
65
+ ## Method
66
+
67
+ Three stages; the transport plan is the interpretable by-product.
68
+
69
+ 1. **Segment** - split each document into atomic statements with the SAT (Segment Any Text) segmenter
70
+ 2. **Embed** - encode each statement with the mmBERT contextual encoder (mean-pooled, L2-normalized)
71
+ 3. **Compare** - optimal transport between the two statement clouds (Statement Mover's Distance), optionally unbalanced so added or missing statements are scored, not force-matched
72
+
73
+ - **Closeness** - `1 − SMD/√2`, on a 0..1 scale
74
+ - **Source-conditioned** - a variant `d(A, B | S)` re-bases the transport onto a shared source `S` and reads off a selection axis and a grounding axis
75
+
76
+ ## Usage
77
+
78
+ The library is the product; install once, then call it.
79
+
80
+ ```python
81
+ from docdistance import document_distance
82
+
83
+ result = document_distance("report_v1.md", "report_v2.md")
84
+ print(result.closeness) # 0..1 similarity, 1 - SMD/sqrt(2)
85
+ print(result.verdict) # "similar" | "not similar"
86
+ ```
87
+
88
+ ```bash
89
+ make install # environment, package, Jupyter kernel
90
+ docdistance install # download + cache the models (once)
91
+ docdistance distance a.md b.md # rich, coloured verdict
92
+ docdistance distance a.md b.md --json # machine-readable JSON
93
+ ```
94
+
95
+ - **Offline after install** - distance calls run fully offline once the models are cached
96
+ - **Backend** - `--backend openvino|torch`, default `openvino` (CPU INT8)
97
+ - **Full API and flags** - `docdistance --help` and the SOTA docs
98
+
99
+ ## Documentation
100
+
101
+ The SOTA documents explain how it works in detail; this README only introduces it.
102
+
103
+ - `docs/wmd-docdistance-solution-sota.md` - source-free distance: design, mechanism, performance, validation
104
+ - `docs/wmd-wrt-source-docdistance-solution.md` - source-conditioned distance `d(A,B|S)`
105
+ - `docs/mmbert-quantization-solution.md` - the INT8 / FP8 statement encoder
106
+ - `references/papers/from-word-embeddings-to-document-distances.md` - WMD paper digest (Kusner et al. 2015)
107
+
108
+ > **Note**: Scaffolded with the [copier-data-science](https://github.com/stellarshenson/copier-data-science) template.
@@ -0,0 +1,63 @@
1
+ # docdistance
2
+
3
+ Semantic distance between two documents via Statement Mover's Distance - optimal transport over mmBERT statement embeddings, after Kusner et al. 2015 (*From Word Embeddings To Document Distances*). This README is a thin front end to the library; the SOTA docs (listed under Documentation) carry the mechanics, benchmarks, and validation.
4
+
5
+ - **Input** - two documents, raw text or a file path
6
+ - **Output** - an SMD distance, a 0..1 closeness, a verdict, and the statement alignment
7
+ - **Use** - agentic document conversion and extraction pipelines, where token logits are unavailable and KL divergence cannot be computed
8
+ - **Unit** - statement-level and position-invariant, with an interpretable transport plan
9
+
10
+ ## Theory
11
+
12
+ A document distance grounded in embeddings and optimal transport, not surface overlap.
13
+
14
+ - **WMD** - Word Mover's Distance (Kusner et al. 2015) casts document similarity as optimal transport between embedded tokens
15
+ - **SMD** - this project lifts it to statements: segment, embed, transport between the two statement clouds
16
+ - **Beyond cosine** - whole-document cosine collapses when the same claims sit in a different place or order; statement-level transport is position-invariant
17
+ - **Metric** - the ground cost `√(2 − 2cos)` on L2-normalized embeddings is a metric, so the document distance is one too
18
+ - **Logit-free** - an embedding-grounded alternative where token probabilities (KL divergence) are unavailable, as in frontier-model pipelines
19
+
20
+ ## Method
21
+
22
+ Three stages; the transport plan is the interpretable by-product.
23
+
24
+ 1. **Segment** - split each document into atomic statements with the SAT (Segment Any Text) segmenter
25
+ 2. **Embed** - encode each statement with the mmBERT contextual encoder (mean-pooled, L2-normalized)
26
+ 3. **Compare** - optimal transport between the two statement clouds (Statement Mover's Distance), optionally unbalanced so added or missing statements are scored, not force-matched
27
+
28
+ - **Closeness** - `1 − SMD/√2`, on a 0..1 scale
29
+ - **Source-conditioned** - a variant `d(A, B | S)` re-bases the transport onto a shared source `S` and reads off a selection axis and a grounding axis
30
+
31
+ ## Usage
32
+
33
+ The library is the product; install once, then call it.
34
+
35
+ ```python
36
+ from docdistance import document_distance
37
+
38
+ result = document_distance("report_v1.md", "report_v2.md")
39
+ print(result.closeness) # 0..1 similarity, 1 - SMD/sqrt(2)
40
+ print(result.verdict) # "similar" | "not similar"
41
+ ```
42
+
43
+ ```bash
44
+ make install # environment, package, Jupyter kernel
45
+ docdistance install # download + cache the models (once)
46
+ docdistance distance a.md b.md # rich, coloured verdict
47
+ docdistance distance a.md b.md --json # machine-readable JSON
48
+ ```
49
+
50
+ - **Offline after install** - distance calls run fully offline once the models are cached
51
+ - **Backend** - `--backend openvino|torch`, default `openvino` (CPU INT8)
52
+ - **Full API and flags** - `docdistance --help` and the SOTA docs
53
+
54
+ ## Documentation
55
+
56
+ The SOTA documents explain how it works in detail; this README only introduces it.
57
+
58
+ - `docs/wmd-docdistance-solution-sota.md` - source-free distance: design, mechanism, performance, validation
59
+ - `docs/wmd-wrt-source-docdistance-solution.md` - source-conditioned distance `d(A,B|S)`
60
+ - `docs/mmbert-quantization-solution.md` - the INT8 / FP8 statement encoder
61
+ - `references/papers/from-word-embeddings-to-document-distances.md` - WMD paper digest (Kusner et al. 2015)
62
+
63
+ > **Note**: Scaffolded with the [copier-data-science](https://github.com/stellarshenson/copier-data-science) template.
@@ -0,0 +1,90 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "docdistance"
7
+ version = "1.0.15"
8
+ description = "Project that uses the theory of From Word Embeddings To Document Distances / Optimal Transport to give a meaningful distance from one document to another; useful when building agentic projects that convert or extract information from one document to another using frontier models, without the ability to calculate KL divergence from logits"
9
+ authors = [
10
+ { name = "Stellars Henson", email = "konrad.jelen+github@gmail.com" },
11
+ ]
12
+ license = "MIT"
13
+ readme = "README.md"
14
+ requires-python = "~=3.13.0"
15
+ keywords = [
16
+ "optimal-transport",
17
+ "word-movers-distance",
18
+ "statement-movers-distance",
19
+ "document-similarity",
20
+ "document-distance",
21
+ "embeddings",
22
+ "mmbert",
23
+ "nlp",
24
+ ]
25
+ classifiers = [
26
+ "Programming Language :: Python :: 3",
27
+ "Programming Language :: Python :: 3.13",
28
+ "Operating System :: OS Independent",
29
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
30
+ "Topic :: Text Processing :: Linguistic",
31
+ "Intended Audience :: Developers",
32
+ "Intended Audience :: Science/Research",
33
+ ]
34
+
35
+ dependencies = [
36
+ "loguru",
37
+ "tqdm",
38
+ "typer",
39
+ "rich",
40
+ "python-dotenv",
41
+ "numpy",
42
+ "pot",
43
+ "transformers",
44
+ "wtpsplit",
45
+ "openvino",
46
+ "torch",
47
+ ]
48
+
49
+ [project.urls]
50
+ Homepage = "https://github.com/stellarshenson/docdistance"
51
+ Repository = "https://github.com/stellarshenson/docdistance"
52
+ Issues = "https://github.com/stellarshenson/docdistance/issues"
53
+
54
+ [project.scripts]
55
+ docdistance = "docdistance.cli:app"
56
+
57
+ [project.optional-dependencies]
58
+ dev = [
59
+ "build",
60
+ "ipykernel",
61
+ "ipython",
62
+ "nbdime",
63
+ "pip",
64
+ "pytest",
65
+ "pytest-cov",
66
+ "ruff",
67
+ "twine",
68
+ "matplotlib",
69
+ "seaborn",
70
+ ]
71
+
72
+ [tool.setuptools]
73
+ include-package-data = true
74
+
75
+ [tool.setuptools.packages.find]
76
+ where = ["src"]
77
+ include = ["docdistance*"]
78
+ exclude = ["tests*"]
79
+
80
+ [tool.ruff]
81
+ line-length = 99
82
+ src = ["src/docdistance"]
83
+ include = ["pyproject.toml", "src/docdistance/**/*.py"]
84
+
85
+ [tool.ruff.lint]
86
+ extend-select = ["I"]
87
+
88
+ [tool.ruff.lint.isort]
89
+ known-first-party = ["docdistance"]
90
+ force-sort-within-sections = true
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,38 @@
1
+ from importlib.metadata import PackageNotFoundError, version
2
+
3
+ from docdistance import config # noqa: F401 (sets up logging + paths on import)
4
+ from docdistance.distance import (
5
+ DistanceResult,
6
+ SourceConditionedResult,
7
+ closeness,
8
+ compute_distance,
9
+ compute_source_conditioned,
10
+ rwmd,
11
+ smd,
12
+ wcd,
13
+ )
14
+ from docdistance.pipeline import (
15
+ DocDistance,
16
+ document_distance,
17
+ source_conditioned_distance,
18
+ )
19
+
20
+ try:
21
+ __version__ = version("docdistance")
22
+ except PackageNotFoundError: # running from source, not installed
23
+ __version__ = "0.0.0"
24
+
25
+ __all__ = [
26
+ "DocDistance",
27
+ "DistanceResult",
28
+ "SourceConditionedResult",
29
+ "document_distance",
30
+ "source_conditioned_distance",
31
+ "compute_distance",
32
+ "compute_source_conditioned",
33
+ "smd",
34
+ "wcd",
35
+ "rwmd",
36
+ "closeness",
37
+ "__version__",
38
+ ]
@@ -0,0 +1,229 @@
1
+ """docdistance command-line interface.
2
+
3
+ Three subcommands - ``install`` (the only one that downloads models), ``distance`` (symmetric SMD)
4
+ and ``distance-wrt-source`` (source-conditioned). Human output is rich and coloured on a capable
5
+ terminal; ``--json`` emits machine-readable JSON and ``--result-only`` emits the bare result.
6
+ Logs go to stderr (loguru, ``--verbose`` for DEBUG), so stdout carries only the result.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from enum import Enum
12
+ import json
13
+
14
+ from rich.console import Console
15
+ from rich.panel import Panel
16
+ from rich.table import Table
17
+ import typer
18
+
19
+ from docdistance.config import configure_logging
20
+ from docdistance.distance import DEFAULT_THRESHOLD
21
+
22
+ app = typer.Typer(
23
+ rich_markup_mode="rich",
24
+ no_args_is_help=True,
25
+ add_completion=False,
26
+ help="[bold]docdistance[/bold] - semantic distance between documents via Statement Mover's Distance "
27
+ "(optimal transport over mmBERT statement embeddings).",
28
+ )
29
+
30
+ _out = Console() # stdout, for the result
31
+ _err = Console(stderr=True) # stderr, for errors
32
+
33
+
34
+ class Backend(str, Enum):  # encoder choices accepted by --backend on the distance commands
35
+     openvino = "openvino"  # default of both distance commands
36
+     torch = "torch"
37
+
38
+
39
+ class InstallBackend(str, Enum):  # choices accepted by `install --backend`
40
+     openvino = "openvino"
41
+     torch = "torch"
42
+     both = "both"  # default of the install command
43
+
44
+
45
+ def _version_cb(value: bool) -> None:  # eager callback wired to the --version option
46
+     if value:
47
+         from docdistance import __version__
48
+
49
+         typer.echo(f"docdistance {__version__}")
50
+         raise typer.Exit()  # stop before any subcommand is dispatched
51
+
52
+
53
+ @app.callback()
54
+ def main(
55
+     version: bool = typer.Option(  # handled entirely by _version_cb; the body never reads it
56
+         False, "--version", callback=_version_cb, is_eager=True, help="show version and exit"
57
+     ),
58
+ ):
59
+     """Semantic document distance grounded in optimal-transport theory."""
60
+
61
+
62
+ def _run(fn):
63
+ """Call ``fn`` and turn a missing-model error into a clean message + exit code 1."""
64
+ from docdistance.encoders import ModelsNotInstalled
65
+
66
+ try:
67
+ return fn()
68
+ except ModelsNotInstalled as exc:
69
+ _err.print(f"[bold red]error:[/bold red] {exc}")
70
+ raise typer.Exit(1)
71
+
72
+
73
+ def _emit_distance(r, json_out: bool, result_only: bool) -> None:
74
+     if result_only:  # checked first, so --result-only wins when --json is also given
75
+         typer.echo(str(r.smd))
76
+         return
77
+     if json_out:
78
+         typer.echo(json.dumps(r.to_dict(), indent=2))
79
+         return
80
+     color = "green" if r.verdict == "similar" else "red"  # reused for verdict text and border
81
+     grid = Table.grid(padding=(0, 2))
82
+     grid.add_column(style="bold cyan")  # label column
83
+     grid.add_column()  # value column
84
+     grid.add_row("SMD (distance)", f"{r.smd:.4f}")
85
+     grid.add_row("closeness", f"{r.closeness * 100:.1f}%")
86
+     grid.add_row(
87
+         "verdict", f"[{color}]{r.verdict}[/{color}] (threshold {r.threshold:.2f} closeness)"
88
+     )
89
+     grid.add_row("bounds", f"WCD {r.wcd:.4f} ≤ RWMD {r.rwmd:.4f} ≤ SMD {r.smd:.4f}")
90
+     grid.add_row("statements", f"{r.n_statements_a} vs {r.n_statements_b}")
91
+     grid.add_row("anisotropy", "on" if r.anisotropy else "off")
92
+     _out.print(
93
+         Panel(grid, title="[bold]Document distance[/bold]", border_style=color, expand=False)
94
+     )
95
+
96
+
97
+ def _emit_wrt_source(r, json_out: bool, result_only: bool) -> None:
98
+     if result_only:  # checked first, so --result-only wins when --json is also given
99
+         typer.echo(f"{r.d_sel},{r.residual_a},{r.residual_b}")
100
+         return
101
+     if json_out:
102
+         typer.echo(json.dumps(r.to_dict(), indent=2))
103
+         return
104
+     grid = Table.grid(padding=(0, 2))
105
+     grid.add_column(style="bold cyan")  # label column
106
+     grid.add_column()  # value column
107
+     grid.add_row("D_sel (selection divergence)", f"{r.d_sel:.4f}")
108
+     grid.add_row("A → source", f"{r.residual_a:.4f} (closeness {r.closeness_a * 100:.1f}%)")
109
+     grid.add_row("B → source", f"{r.residual_b:.4f} (closeness {r.closeness_b * 100:.1f}%)")
110
+     grid.add_row(
111
+         "statements", f"A {r.n_statements_a} / B {r.n_statements_b} / S {r.n_statements_source}"
112
+     )
113
+     _out.print(
114
+         Panel(
115
+             grid,
116
+             title="[bold]Source-conditioned distance d(A,B|S)[/bold]",
117
+             border_style="cyan",
118
+             expand=False,
119
+         )
120
+     )
121
+     _out.print(  # footnote printed under the panel, human-readable mode only
122
+         "[dim]residual = geometric distance to the source; the reranker + NLI grounding grade and "
123
+         "numeric verifier are deferred to E02[/dim]"
124
+     )
125
+
126
+
127
+ @app.command(
128
+     epilog="[bold]Examples[/bold]\n\n"
129
+     " docdistance distance report_v1.md report_v2.md\n"
130
+     ' docdistance distance "first text" "second text" --backend torch\n'
131
+     " docdistance distance a.md b.md --json\n"
132
+     " docdistance distance a.md b.md --result-only"
133
+ )
134
+ def distance(
135
+     a: str = typer.Argument(..., help="first document - a file path or raw text"),
136
+     b: str = typer.Argument(..., help="second document - a file path or raw text"),
137
+     backend: Backend = typer.Option(
138
+         Backend.openvino, "--backend", help="statement encoder backend"
139
+     ),
140
+     anisotropy: bool = typer.Option(
141
+         False,
142
+         "--anisotropy/--no-anisotropy",
143
+         help="all-but-the-top anisotropy removal - needs a corpus, off by default for a pair",
144
+     ),
145
+     threshold: float = typer.Option(
146
+         DEFAULT_THRESHOLD,
147
+         "--threshold",
148
+         help="closeness cutoff for the similar / not-similar verdict",
149
+     ),
150
+     json_out: bool = typer.Option(False, "--json", help="machine-readable JSON to stdout"),
151
+     result_only: bool = typer.Option(
152
+         False, "--result-only", help="bare SMD scalar to stdout, no clutter"
153
+     ),
154
+     verbose: bool = typer.Option(False, "--verbose", "-v", help="DEBUG logging to stderr"),
155
+ ):
156
+     """Symmetric distance between two documents - the exact Statement Mover's Distance."""
157
+     configure_logging(verbose)  # set the log level before the pipeline is imported or run
158
+     from docdistance.pipeline import document_distance  # function-scope import, only when run
159
+
160
+     result = _run(  # the lambda defers the call so _run can trap ModelsNotInstalled
161
+         lambda: document_distance(
162
+             a, b, backend=backend.value, anisotropy=anisotropy, threshold=threshold
163
+         )
164
+     )
165
+     _emit_distance(result, json_out, result_only)
166
+
167
+
168
+ @app.command(
169
+     name="distance-wrt-source",
170
+     epilog="[bold]Examples[/bold]\n\n"
171
+     " docdistance distance-wrt-source summary_a.md summary_b.md --source article.md\n"
172
+     " docdistance distance-wrt-source a.md b.md -s s.md --json\n"
173
+     " docdistance distance-wrt-source a.md b.md -s s.md --result-only [dim]# D_sel,res_a,res_b[/dim]",
174
+ )
175
+ def distance_wrt_source(
176
+     a: str = typer.Argument(..., help="first document - a file path or raw text"),
177
+     b: str = typer.Argument(..., help="second document - a file path or raw text"),
178
+     source: str = typer.Option(..., "--source", "-s", help="the common source document"),
179
+     backend: Backend = typer.Option(
180
+         Backend.openvino, "--backend", help="statement encoder backend"
181
+     ),
182
+     anisotropy: bool = typer.Option(
183
+         False,
184
+         "--anisotropy/--no-anisotropy",
185
+         help="anisotropy removal - needs a corpus, off by default",
186
+     ),
187
+     json_out: bool = typer.Option(False, "--json", help="machine-readable JSON to stdout"),
188
+     result_only: bool = typer.Option(
189
+         False, "--result-only", help="bare comma-separated D_sel,residual_a,residual_b to stdout"
190
+     ),
191
+     verbose: bool = typer.Option(False, "--verbose", "-v", help="DEBUG logging to stderr"),
192
+ ):
193
+     """Source-conditioned distance d(A, B | S) - selection divergence plus each document's distance to S."""
194
+     configure_logging(verbose)  # set the log level before the pipeline is imported or run
195
+     from docdistance.pipeline import source_conditioned_distance  # function-scope import
196
+
197
+     result = _run(  # the lambda defers the call so _run can trap ModelsNotInstalled
198
+         lambda: source_conditioned_distance(
199
+             a, b, source, backend=backend.value, anisotropy=anisotropy
200
+         )
201
+     )
202
+     _emit_wrt_source(result, json_out, result_only)
203
+
204
+
205
+ @app.command(
206
+ epilog="[bold]Examples[/bold]\n\n"
207
+ " docdistance install [dim]# both backends[/dim]\n"
208
+ " docdistance install --backend openvino",
209
+ )
210
+ def install(
211
+ backend: InstallBackend = typer.Option(
212
+ InstallBackend.both, "--backend", help="which encoder weights to fetch"
213
+ ),
214
+ verbose: bool = typer.Option(False, "--verbose", "-v", help="DEBUG logging to stderr"),
215
+ ):
216
+ """Download and cache the models - the only command that fetches from the Hub (TQDM progress bars)."""
217
+ configure_logging(verbose)
218
+ from docdistance.encoders import ModelsNotInstalled, download_models
219
+
220
+ try:
221
+ backends = download_models(backend.value)
222
+ except ModelsNotInstalled as exc:
223
+ _err.print(f"[bold red]error:[/bold red] {exc}")
224
+ raise typer.Exit(1)
225
+ _out.print(f"[green]models ready:[/green] {', '.join(backends)}")
226
+
227
+
228
+ if __name__ == "__main__":
229
+ app()
@@ -0,0 +1,66 @@
1
+ from pathlib import Path
2
+ import sys
3
+
4
+ from dotenv import load_dotenv
5
+ from loguru import logger
6
+
7
+ ########### SETUP ###############
8
+
9
+ # set up logger - INFO by default (DEBUG only via the CLI --verbose flag), sink to stderr so
10
+ # stdout stays clean for --json / --result-only output
11
+ logger.remove()
12
+ logger.add(sys.stderr, colorize=True, level="INFO")
13
+
14
+ # If tqdm is installed, replace the plain stderr sink above with one that writes via tqdm.write
15
+ # https://github.com/Delgan/loguru/issues/135
16
+ try:
17
+ from tqdm import tqdm
18
+
19
+ logger.remove()
20
+ logger.add(lambda msg: tqdm.write(msg, end="", file=sys.stderr), colorize=True, level="INFO")
21
+ except ModuleNotFoundError:
22
+ pass
23
+
24
+ ########## VARIABLES ############
25
+
26
+ # Load environment variables from .env file if it exists
27
+ load_dotenv()
28
+
29
+ # paths - NOTE(review): parents[2] assumes the src/ checkout layout; verify for installed use
30
+ PROJ_ROOT = Path(__file__).resolve().parents[2]
31
+ DATA_DIR = PROJ_ROOT / "data"
32
+ RAW_DATA_DIR = DATA_DIR / "raw"
33
+ INTERIM_DATA_DIR = DATA_DIR / "interim"
34
+ PROCESSED_DATA_DIR = DATA_DIR / "processed"
35
+ EXTERNAL_DATA_DIR = DATA_DIR / "external"
36
+ MODELS_DIR = PROJ_ROOT / "models"
37
+ REPORTS_DIR = PROJ_ROOT / "reports"
38
+ FIGURES_DIR = REPORTS_DIR / "figures"
39
+
40
+ # log current root dir (debug, so it stays out of the default INFO-level stderr output)
41
+ logger.debug(f"PROJ_ROOT path is: {PROJ_ROOT}")
42
+
43
+ ########## MODELS ###############
44
+
45
+ # segmenter (wtpsplit SaT) and the mmBERT statement encoders, by backend
46
+ SAT_MODEL = "sat-3l-sm"
47
+ MMBERT_TORCH_MODEL = "jhu-clsp/mmBERT-base"
48
+ MMBERT_OPENVINO_LOCAL = MODELS_DIR / "02-mmbert-openvino-int8"
49
+ MMBERT_OPENVINO_HF = "stellars/mmBERT-base-openvino-int8"
50
+
51
+
52
+ def configure_logging(verbose: bool = False) -> None:
53
+     """Reset loguru to one stderr sink at INFO, or DEBUG when ``verbose`` - the CLI calls this first.
54
+
55
+     stderr keeps stdout reserved for the result so ``--json`` and ``--result-only`` stay machine-parseable.
56
+     """
57
+     level = "DEBUG" if verbose else "INFO"
58
+     logger.remove()  # drop the sinks registered so far, including the import-time one
59
+     try:
60
+         from tqdm import tqdm
61
+
62
+         logger.add(  # write through tqdm.write, still targeting stderr
63
+             lambda msg: tqdm.write(msg, end="", file=sys.stderr), colorize=True, level=level
64
+         )
65
+     except ModuleNotFoundError:  # tqdm missing: fall back to a plain stderr sink
66
+         logger.add(sys.stderr, colorize=True, level=level)
@@ -0,0 +1,29 @@
1
+ from pathlib import Path
2
+
3
+ from loguru import logger
4
+ from tqdm import tqdm
5
+ import typer
6
+
7
+ from docdistance.config import PROCESSED_DATA_DIR, RAW_DATA_DIR
8
+
9
+ app = typer.Typer()
10
+
11
+
12
+ @app.command()
13
+ def main(
14
+     # ---- REPLACE DEFAULT PATHS AS APPROPRIATE ----
15
+     input_path: Path = RAW_DATA_DIR / "dataset.csv",
16
+     output_path: Path = PROCESSED_DATA_DIR / "dataset.csv",
17
+     # ----------------------------------------------
18
+ ):
19
+     # ---- REPLACE THIS WITH YOUR OWN CODE (placeholder: input_path/output_path are not read) ----
20
+     logger.info("Processing dataset...")
21
+     for i in tqdm(range(10), total=10):  # dummy loop, only drives the progress bar
22
+         if i == 5:
23
+             logger.info("Something happened for iteration 5.")
24
+     logger.success("Processing dataset complete.")
25
+     # -----------------------------------------
26
+
27
+
28
+ if __name__ == "__main__":
29
+ app()