denselinkage 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. denselinkage-1.0.0/LICENSE +21 -0
  2. denselinkage-1.0.0/PKG-INFO +301 -0
  3. denselinkage-1.0.0/README.md +249 -0
  4. denselinkage-1.0.0/pyproject.toml +135 -0
  5. denselinkage-1.0.0/src/denselinkage/__init__.py +47 -0
  6. denselinkage-1.0.0/src/denselinkage/_optional/__init__.py +21 -0
  7. denselinkage-1.0.0/src/denselinkage/_optional/require.py +29 -0
  8. denselinkage-1.0.0/src/denselinkage/_reader/__init__.py +25 -0
  9. denselinkage-1.0.0/src/denselinkage/_reader/record_reader.py +37 -0
  10. denselinkage-1.0.0/src/denselinkage/_store/__init__.py +12 -0
  11. denselinkage-1.0.0/src/denselinkage/_store/reference_store.py +134 -0
  12. denselinkage-1.0.0/src/denselinkage/blocking/__init__.py +14 -0
  13. denselinkage-1.0.0/src/denselinkage/blocking/dense_blocker.py +43 -0
  14. denselinkage-1.0.0/src/denselinkage/blocking/dense_blocking_index.py +88 -0
  15. denselinkage-1.0.0/src/denselinkage/clustering/__init__.py +19 -0
  16. denselinkage-1.0.0/src/denselinkage/clustering/_union_find.py +57 -0
  17. denselinkage-1.0.0/src/denselinkage/clustering/connected_components.py +48 -0
  18. denselinkage-1.0.0/src/denselinkage/clustering/connected_components_clusterer.py +13 -0
  19. denselinkage-1.0.0/src/denselinkage/core/__init__.py +74 -0
  20. denselinkage-1.0.0/src/denselinkage/core/errors.py +60 -0
  21. denselinkage-1.0.0/src/denselinkage/core/models.py +80 -0
  22. denselinkage-1.0.0/src/denselinkage/core/ports.py +202 -0
  23. denselinkage-1.0.0/src/denselinkage/core/results.py +170 -0
  24. denselinkage-1.0.0/src/denselinkage/embedding/__init__.py +13 -0
  25. denselinkage-1.0.0/src/denselinkage/embedding/hashed_ngram_embedder.py +54 -0
  26. denselinkage-1.0.0/src/denselinkage/embedding/sentence_transformer_embedder.py +61 -0
  27. denselinkage-1.0.0/src/denselinkage/filtering/__init__.py +14 -0
  28. denselinkage-1.0.0/src/denselinkage/filtering/similarity_threshold_filter.py +29 -0
  29. denselinkage-1.0.0/src/denselinkage/indexing/__init__.py +22 -0
  30. denselinkage-1.0.0/src/denselinkage/indexing/faiss_flat_index.py +34 -0
  31. denselinkage-1.0.0/src/denselinkage/indexing/faiss_searchable_index.py +84 -0
  32. denselinkage-1.0.0/src/denselinkage/indexing/numpy_flat_index.py +14 -0
  33. denselinkage-1.0.0/src/denselinkage/indexing/numpy_searchable_index.py +60 -0
  34. denselinkage-1.0.0/src/denselinkage/linkage/__init__.py +18 -0
  35. denselinkage-1.0.0/src/denselinkage/linkage/_assembly.py +39 -0
  36. denselinkage-1.0.0/src/denselinkage/linkage/candidate_frame.py +67 -0
  37. denselinkage-1.0.0/src/denselinkage/linkage/dense_linker.py +167 -0
  38. denselinkage-1.0.0/src/denselinkage/linkage/linkage_index.py +79 -0
  39. denselinkage-1.0.0/src/denselinkage/matching/__init__.py +13 -0
  40. denselinkage-1.0.0/src/denselinkage/matching/langchain_matcher.py +104 -0
  41. denselinkage-1.0.0/src/denselinkage/matching/retry_policy.py +9 -0
  42. denselinkage-1.0.0/src/denselinkage/matching/threshold_matcher.py +29 -0
  43. denselinkage-1.0.0/src/denselinkage/metrics/__init__.py +32 -0
  44. denselinkage-1.0.0/src/denselinkage/metrics/_pairing.py +11 -0
  45. denselinkage-1.0.0/src/denselinkage/metrics/adjusted.py +73 -0
  46. denselinkage-1.0.0/src/denselinkage/metrics/blocking.py +89 -0
  47. denselinkage-1.0.0/src/denselinkage/metrics/clustering.py +109 -0
  48. denselinkage-1.0.0/src/denselinkage/metrics/linkage.py +95 -0
  49. denselinkage-1.0.0/src/denselinkage/metrics/tuning.py +88 -0
  50. denselinkage-1.0.0/src/denselinkage/mining/__init__.py +9 -0
  51. denselinkage-1.0.0/src/denselinkage/mining/hard_negatives.py +43 -0
  52. denselinkage-1.0.0/src/denselinkage/py.typed +0 -0
  53. denselinkage-1.0.0/src/denselinkage/serializing/__init__.py +23 -0
  54. denselinkage-1.0.0/src/denselinkage/serializing/fieldwise_serializer.py +15 -0
  55. denselinkage-1.0.0/src/denselinkage/serializing/template_serializer.py +31 -0
  56. denselinkage-1.0.0/src/denselinkage/serializing/whole_row_serializer.py +20 -0
  57. denselinkage-1.0.0/src/denselinkage/training/__init__.py +11 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Alvaro
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,301 @@
1
+ Metadata-Version: 2.4
2
+ Name: denselinkage
3
+ Version: 1.0.0
4
+ Summary: Record linkage with dense blocking using text embeddings and LLM matching
5
+ Keywords: record-linkage,entity-resolution,embeddings,blocking,llm
6
+ Author: Alvaro
7
+ Author-email: Alvaro <alvarocarvalho@live.com>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Scientific/Engineering
17
+ Classifier: Typing :: Typed
18
+ Requires-Dist: numpy>=1.24
19
+ Requires-Dist: pandas>=2.0
20
+ Requires-Dist: langchain-core>=0.3 ; extra == 'all'
21
+ Requires-Dist: langchain-openai>=0.2 ; extra == 'all'
22
+ Requires-Dist: faiss-cpu>=1.8 ; extra == 'all'
23
+ Requires-Dist: sentence-transformers>=3.0 ; extra == 'all'
24
+ Requires-Dist: mypy>=1.11 ; extra == 'dev'
25
+ Requires-Dist: ruff>=0.6 ; extra == 'dev'
26
+ Requires-Dist: pytest>=8 ; extra == 'dev'
27
+ Requires-Dist: pytest-cov>=5 ; extra == 'dev'
28
+ Requires-Dist: pandas-stubs>=2.0 ; extra == 'dev'
29
+ Requires-Dist: sphinx>=8 ; extra == 'docs'
30
+ Requires-Dist: furo>=2024.8 ; extra == 'docs'
31
+ Requires-Dist: myst-parser>=4 ; extra == 'docs'
32
+ Requires-Dist: sphinx-copybutton>=0.5 ; extra == 'docs'
33
+ Requires-Dist: sphinx-design>=0.6 ; extra == 'docs'
34
+ Requires-Dist: sphinxcontrib-mermaid>=0.9 ; extra == 'docs'
35
+ Requires-Dist: faiss-cpu>=1.8 ; extra == 'faiss'
36
+ Requires-Dist: langchain-core>=0.3 ; extra == 'langchain'
37
+ Requires-Dist: langchain-openai>=0.2 ; extra == 'langchain'
38
+ Requires-Dist: sentence-transformers>=3.0 ; extra == 'sentence-transformers'
39
+ Requires-Python: >=3.10
40
+ Project-URL: Homepage, https://github.com/caalvaro/denselinkage
41
+ Project-URL: Documentation, https://caalvaro.github.io/denselinkage/
42
+ Project-URL: Repository, https://github.com/caalvaro/denselinkage
43
+ Project-URL: Issues, https://github.com/caalvaro/denselinkage/issues
44
+ Provides-Extra: all
45
+ Provides-Extra: dev
46
+ Provides-Extra: docs
47
+ Provides-Extra: faiss
48
+ Provides-Extra: langchain
49
+ Provides-Extra: sentence-transformers
50
+ Provides-Extra: train
51
+ Description-Content-Type: text/markdown
52
+
53
+ # denselinkage
54
+
55
+ [![CI](https://github.com/caalvaro/denselinkage/actions/workflows/ci.yml/badge.svg)](https://github.com/caalvaro/denselinkage/actions/workflows/ci.yml)
56
+ [![PyPI](https://img.shields.io/pypi/v/denselinkage.svg)](https://pypi.org/project/denselinkage/)
57
+ [![Python versions](https://img.shields.io/pypi/pyversions/denselinkage.svg)](https://pypi.org/project/denselinkage/)
58
+ [![Docs](https://img.shields.io/badge/docs-GitHub%20Pages-blue.svg)](https://caalvaro.github.io/denselinkage/)
59
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
60
+
61
+ **Record linkage and deduplication for Python — dense blocking, optional LLM matching, and evaluation built in.**
62
+
63
+ `denselinkage` finds the records that refer to the same real-world entity, whether
64
+ they live in **two datasets** (record *linkage*) or **one** (*deduplication*). It
65
+ shrinks the impossible all-pairs comparison down to a plausible few with
66
+ embedding-based **blocking**, decides each candidate with a pluggable **matcher** —
67
+ a fast similarity threshold or a large language model — then clusters and scores
68
+ the result.
69
+
70
+ The core runs on **numpy + pandas alone**. FAISS, sentence-transformers, and
71
+ LangChain are optional extras you reach for when you need
72
+ approximate-nearest-neighbour search at scale, semantic embeddings, or LLM-based matching — `import
73
+ denselinkage` pulls in none of them until you ask.
74
+
75
+ ## Highlights
76
+
77
+ - 🪶 **Lightweight core** — `pip install denselinkage` is just numpy + pandas.
78
+ The heavy ML backends are opt-in extras, and the import graph proves it: CI fails
79
+ if a backend ever leaks into the core.
80
+ - 🔁 **Swap any stage** — the embedder, vector index, and matcher are independent
81
+ components behind small `Protocol`s. Go from lexical → semantic, brute-force →
82
+ FAISS, threshold → LLM without rewriting your pipeline.
83
+ - 📦 **End to end** — block → match → cluster → evaluate, with linkage, blocking,
84
+ and clustering (B³) **metrics included**.
85
+ - 🧊 **Immutable by design** — `link` / `dedupe` / `match_pairs` are single calls
86
+ with no hidden `fit`/`predict` state. Build a reference index once and reuse it.
87
+ - 🧪 **Typed, tested, stable** — strict `mypy`, a shipped `py.typed` marker,
88
+ **100% branch coverage**, and a frozen 1.0 API (evolution is *extend, never
89
+ modify*).
90
+
91
+ ## Installation
92
+
93
+ ```bash
94
+ pip install denselinkage # core — numpy + pandas only
95
+ ```
96
+
97
+ Add extras when you need them (or `[all]` for everything):
98
+
99
+ ```bash
100
+ pip install "denselinkage[sentence-transformers]" # semantic embeddings
101
+ pip install "denselinkage[faiss]" # FAISS vector index
102
+ pip install "denselinkage[langchain]" # LLM matcher
103
+ pip install "denselinkage[all]"
104
+ ```
105
+
106
+ Requires Python 3.10+.
107
+
108
+ ## Quickstart
109
+
110
+ Link two tables of companies with messy, inconsistent names — no configuration,
111
+ one call:
112
+
113
+ ```python
114
+ import pandas as pd
115
+ from denselinkage import DenseLinker, LabeledPairs, Source
116
+ from denselinkage.metrics import linkage_metrics
117
+
118
+ left = pd.DataFrame({
119
+ "id": ["A1", "A2", "A3"],
120
+ "name": ["Apple Inc", "Microsoft Corp", "Google LLC"],
121
+ "city": ["Cupertino", "Redmond", "Mountain View"],
122
+ })
123
+ right = pd.DataFrame({
124
+ "id": ["B1", "B2", "B3"],
125
+ "name": ["Apple Incorporated", "Microsoft", "Google"],
126
+ "city": ["Cupertino", "Redmond", "Mountain View"],
127
+ })
128
+
129
+ linker = DenseLinker.with_defaults() # lexical stack: embed → index → threshold
130
+ result = linker.link( # one call — no fit/predict, no mutation
131
+ Source(left, id_column="id"),
132
+ Source(right, id_column="id"),
133
+ )
134
+
135
+ print(result.to_frame().query("match")) # the decided matches, as a DataFrame
136
+ gold = LabeledPairs.from_pairs([("A1", "B1"), ("A2", "B2"), ("A3", "B3")])
137
+ m = linkage_metrics(result, gold=gold)
138
+ print(f"precision={m.precision:.2f} recall={m.recall:.2f} f1={m.f1:.2f}")
139
+ ```
140
+
141
+ ```text
142
+ left_id right_id similarity match confidence reason
143
+ 0 A1 B1 0.762443 True None None
144
+ 3 A2 B2 0.833908 True None None
145
+ 6 A3 B3 0.864126 True None None
146
+ precision=1.00 recall=1.00 f1=1.00
147
+ ```
148
+
149
+ `with_defaults()` wires the core-only **lexical** stack — character n-gram
150
+ embeddings, brute-force nearest-neighbour search, and a similarity threshold. It
151
+ recovers abbreviations, punctuation, and typos (`Apple Inc` ↔ `Apple Incorporated`)
152
+ out of the box.
153
+
154
+ ## How it works
155
+
156
+ denselinkage is a four-stage pipeline, and every stage is a swappable component:
157
+
158
+ ```text
159
+ Sources ──► Block ──────► Match ──────► Cluster ──────► Evaluate
160
+ (embed + (threshold (connected (P/R/F1,
161
+ top-k NN) or LLM) components) B³, …)
162
+ ```
163
+
164
+ 1. **Block** — embed each record and retrieve its top-k nearest neighbours, turning
165
+ an `N × M` comparison into a handful of candidate pairs.
166
+ 2. **Match** — decide each candidate. `ThresholdMatcher` gates on similarity;
167
+ `LangChainMatcher` asks an LLM and returns a typed decision.
168
+ 3. **Cluster** — group the matches into entities with transitive
169
+ `connected_components`.
170
+ 4. **Evaluate** — score against gold labels with linkage, blocking, or clustering
171
+ (B³) metrics.
172
+
173
+ Three verbs cover the common shapes — **`link`** (two datasets), **`dedupe`** (one
174
+ dataset against itself), and **`match_pairs`** (you already have candidate pairs).
175
+ `index()` builds a reusable reference index, so you embed once and query many times.
176
+
177
+ ## Scaling up: semantic + LLM matching
178
+
179
+ The lexical default is fast and free, but it only sees *characters* — it can't tell
180
+ that *Google* and *Alphabet* are the same company. Swap in the heavy adapters for
181
+ **meaning** (semantic embeddings), **scale** (FAISS), and **judgment** (an LLM), all
182
+ behind the same ports:
183
+
184
+ | Stage | Lexical (default) | Semantic + LLM |
185
+ |------:|-------------------|----------------|
186
+ | Embed | `HashedNGramEmbedder` | `SentenceTransformerEmbedder` · `[sentence-transformers]` |
187
+ | Index | `NumpyFlatIndex` | `FaissFlatIndex` · `[faiss]` |
188
+ | Match | `ThresholdMatcher` | `LangChainMatcher` · `[langchain]` |
189
+ | Catches | typos, abbreviations | + semantic renames, + judgment calls |
190
+
191
+ ```python
192
+ from denselinkage import DenseLinker
193
+ from denselinkage.blocking import DenseBlocker
194
+ from denselinkage.embedding import SentenceTransformerEmbedder
195
+ from denselinkage.indexing import FaissFlatIndex
196
+ from denselinkage.matching import LangChainMatcher
197
+ from langchain_openai import ChatOpenAI
198
+
199
+ linker = DenseLinker(
200
+ blocker=DenseBlocker(
201
+ embedder=SentenceTransformerEmbedder("all-MiniLM-L6-v2"),
202
+ vector_index=FaissFlatIndex(),
203
+ top_k=5, similarity_threshold=0.6,
204
+ ),
205
+ matcher=LangChainMatcher(
206
+ llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
207
+ prompt="Are these the same entity?\nA: {record_a}\nB: {record_b}",
208
+ ),
209
+ )
210
+ result = linker.link(left, right) # the call is unchanged: pass the same `Source` objects as in the Quickstart
211
+ ```
212
+
213
+ Because the score is cosine on both stacks, a `similarity_threshold` tuned on the
214
+ lexical stack keeps its meaning here. See the
215
+ [Semantic + LLM guide](https://caalvaro.github.io/denselinkage/guide/semantic-llm.html)
216
+ for model selection, the prompt contract, retries, and cost.
217
+
218
+ ## Deduplicate and cluster
219
+
220
+ ```python
221
+ from denselinkage import DenseLinker, Source, connected_components
222
+
223
+ # df: one table that may contain duplicate records, with an "id" column
224
+ result = DenseLinker.with_defaults().dedupe(Source(df, id_column="id"))
225
+ clusters = connected_components(result) # transitive grouping → entities
226
+ print(clusters.to_frame()) # record_id, cluster_id
227
+ ```
228
+
229
+ `dedupe` links a dataset against itself and suppresses self-pairs internally.
230
+ Clustering is transitive (A~B, B~C ⇒ one cluster), so a noisy matcher can
231
+ over-merge — watch for B³ recall ≫ precision.
232
+
233
+ ## Evaluation
234
+
235
+ Metrics are first-class, split by what they measure:
236
+
237
+ - **Linkage** — `linkage_metrics` → precision / recall / F1 over matched pairs
238
+ (undecidable pairs are surfaced as errors and counted separately, never mixed in).
239
+ - **Blocking** — `blocking_metrics` / `pair_completeness_at_k` → did blocking even
240
+ surface the true pairs?
241
+ - **Clustering** — `clustering_metrics` → B³ (Bagga–Baldwin) precision / recall / F1
242
+ over the entity clusters.
243
+
244
+ Plus `tune_threshold` for a P/R/F1 sweep and `mine_hard_negatives` for contrastive
245
+ training material.
246
+
247
+ ## Design
248
+
249
+ denselinkage is **contract-first** (hexagonal / ports-and-adapters). Domain logic
250
+ talks to small `typing.Protocol`s — `Embedder`, `VectorIndex`, `Matcher`, … — and
251
+ concrete adapters plug in behind them. Two consequences worth knowing:
252
+
253
+ - **The dependency cut is structural.** Heavy backends import lazily, inside the
254
+ methods that use them; a CI job asserts `import denselinkage` pulls in no FAISS /
255
+ torch / LangChain.
256
+ - **The 1.0 contract is frozen.** Signatures and field types won't change under
257
+ you; the library evolves by *adding* (an optional field, a sibling type, a new
258
+ classmethod), never by modifying. Stateful components follow **spec → artifact**:
259
+ a stateless spec's `build(...)` returns an immutable, fitted artifact.
260
+
261
+ See the
262
+ [architecture overview](https://caalvaro.github.io/denselinkage/architecture.html)
263
+ for the full picture.
264
+
265
+ ## Documentation
266
+
267
+ 📖 **[Full documentation →](https://caalvaro.github.io/denselinkage/)**
268
+
269
+ - [Tutorial](https://caalvaro.github.io/denselinkage/getting-started/tutorial.html)
270
+ — link two tables stage by stage.
271
+ - [Semantic + LLM matching](https://caalvaro.github.io/denselinkage/guide/semantic-llm.html)
272
+ and [Choosing components](https://caalvaro.github.io/denselinkage/guide/choosing-components.html).
273
+ - [API reference](https://caalvaro.github.io/denselinkage/api/index.html).
274
+
275
+ Runnable scripts live in [`examples/`](examples/) — `00_quickstart.py` is the
276
+ shortest path; `01`/`02` show the full semantic + LLM assembly.
277
+
278
+ ## Development
279
+
280
+ Requires [uv](https://docs.astral.sh/uv/).
281
+
282
+ ```bash
283
+ uv sync --dev
284
+ uv run ruff check . && uv run ruff format --check . && uv run mypy && uv run pytest
285
+ ```
286
+
287
+ CI runs lint, format, strict mypy, and the test suite on Python 3.10–3.13, with a
288
+ separate job for the optional adapters. See [CONTRIBUTING.md](CONTRIBUTING.md).
289
+
290
+ ## Changelog
291
+
292
+ See [CHANGELOG.md](CHANGELOG.md).
293
+
294
+ ## Citing
295
+
296
+ If you use denselinkage in your research, please cite it — see
297
+ [`CITATION.cff`](CITATION.cff).
298
+
299
+ ## License
300
+
301
+ [MIT](LICENSE) © 2026 Alvaro
@@ -0,0 +1,249 @@
1
+ # denselinkage
2
+
3
+ [![CI](https://github.com/caalvaro/denselinkage/actions/workflows/ci.yml/badge.svg)](https://github.com/caalvaro/denselinkage/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/denselinkage.svg)](https://pypi.org/project/denselinkage/)
5
+ [![Python versions](https://img.shields.io/pypi/pyversions/denselinkage.svg)](https://pypi.org/project/denselinkage/)
6
+ [![Docs](https://img.shields.io/badge/docs-GitHub%20Pages-blue.svg)](https://caalvaro.github.io/denselinkage/)
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
8
+
9
+ **Record linkage and deduplication for Python — dense blocking, optional LLM matching, and evaluation built in.**
10
+
11
+ `denselinkage` finds the records that refer to the same real-world entity, whether
12
+ they live in **two datasets** (record *linkage*) or **one** (*deduplication*). It
13
+ shrinks the impossible all-pairs comparison down to a plausible few with
14
+ embedding-based **blocking**, decides each candidate with a pluggable **matcher** —
15
+ a fast similarity threshold or a large language model — then clusters and scores
16
+ the result.
17
+
18
+ The core runs on **numpy + pandas alone**. FAISS, sentence-transformers, and
19
+ LangChain are optional extras you reach for when you need
20
+ approximate-nearest-neighbour search at scale, semantic embeddings, or LLM-based matching — `import
21
+ denselinkage` pulls in none of them until you ask.
22
+
23
+ ## Highlights
24
+
25
+ - 🪶 **Lightweight core** — `pip install denselinkage` is just numpy + pandas.
26
+ The heavy ML backends are opt-in extras, and the import graph proves it: CI fails
27
+ if a backend ever leaks into the core.
28
+ - 🔁 **Swap any stage** — the embedder, vector index, and matcher are independent
29
+ components behind small `Protocol`s. Go from lexical → semantic, brute-force →
30
+ FAISS, threshold → LLM without rewriting your pipeline.
31
+ - 📦 **End to end** — block → match → cluster → evaluate, with linkage, blocking,
32
+ and clustering (B³) **metrics included**.
33
+ - 🧊 **Immutable by design** — `link` / `dedupe` / `match_pairs` are single calls
34
+ with no hidden `fit`/`predict` state. Build a reference index once and reuse it.
35
+ - 🧪 **Typed, tested, stable** — strict `mypy`, a shipped `py.typed` marker,
36
+ **100% branch coverage**, and a frozen 1.0 API (evolution is *extend, never
37
+ modify*).
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ pip install denselinkage # core — numpy + pandas only
43
+ ```
44
+
45
+ Add extras when you need them (or `[all]` for everything):
46
+
47
+ ```bash
48
+ pip install "denselinkage[sentence-transformers]" # semantic embeddings
49
+ pip install "denselinkage[faiss]" # FAISS vector index
50
+ pip install "denselinkage[langchain]" # LLM matcher
51
+ pip install "denselinkage[all]"
52
+ ```
53
+
54
+ Requires Python 3.10+.
55
+
56
+ ## Quickstart
57
+
58
+ Link two tables of companies with messy, inconsistent names — no configuration,
59
+ one call:
60
+
61
+ ```python
62
+ import pandas as pd
63
+ from denselinkage import DenseLinker, LabeledPairs, Source
64
+ from denselinkage.metrics import linkage_metrics
65
+
66
+ left = pd.DataFrame({
67
+ "id": ["A1", "A2", "A3"],
68
+ "name": ["Apple Inc", "Microsoft Corp", "Google LLC"],
69
+ "city": ["Cupertino", "Redmond", "Mountain View"],
70
+ })
71
+ right = pd.DataFrame({
72
+ "id": ["B1", "B2", "B3"],
73
+ "name": ["Apple Incorporated", "Microsoft", "Google"],
74
+ "city": ["Cupertino", "Redmond", "Mountain View"],
75
+ })
76
+
77
+ linker = DenseLinker.with_defaults() # lexical stack: embed → index → threshold
78
+ result = linker.link( # one call — no fit/predict, no mutation
79
+ Source(left, id_column="id"),
80
+ Source(right, id_column="id"),
81
+ )
82
+
83
+ print(result.to_frame().query("match")) # the decided matches, as a DataFrame
84
+ gold = LabeledPairs.from_pairs([("A1", "B1"), ("A2", "B2"), ("A3", "B3")])
85
+ m = linkage_metrics(result, gold=gold)
86
+ print(f"precision={m.precision:.2f} recall={m.recall:.2f} f1={m.f1:.2f}")
87
+ ```
88
+
89
+ ```text
90
+ left_id right_id similarity match confidence reason
91
+ 0 A1 B1 0.762443 True None None
92
+ 3 A2 B2 0.833908 True None None
93
+ 6 A3 B3 0.864126 True None None
94
+ precision=1.00 recall=1.00 f1=1.00
95
+ ```
96
+
97
+ `with_defaults()` wires the core-only **lexical** stack — character n-gram
98
+ embeddings, brute-force nearest-neighbour search, and a similarity threshold. It
99
+ recovers abbreviations, punctuation, and typos (`Apple Inc` ↔ `Apple Incorporated`)
100
+ out of the box.
101
+
102
+ ## How it works
103
+
104
+ denselinkage is a four-stage pipeline, and every stage is a swappable component:
105
+
106
+ ```text
107
+ Sources ──► Block ──────► Match ──────► Cluster ──────► Evaluate
108
+ (embed + (threshold (connected (P/R/F1,
109
+ top-k NN) or LLM) components) B³, …)
110
+ ```
111
+
112
+ 1. **Block** — embed each record and retrieve its top-k nearest neighbours, turning
113
+ an `N × M` comparison into a handful of candidate pairs.
114
+ 2. **Match** — decide each candidate. `ThresholdMatcher` gates on similarity;
115
+ `LangChainMatcher` asks an LLM and returns a typed decision.
116
+ 3. **Cluster** — group the matches into entities with transitive
117
+ `connected_components`.
118
+ 4. **Evaluate** — score against gold labels with linkage, blocking, or clustering
119
+ (B³) metrics.
120
+
121
+ Three verbs cover the common shapes — **`link`** (two datasets), **`dedupe`** (one
122
+ dataset against itself), and **`match_pairs`** (you already have candidate pairs).
123
+ `index()` builds a reusable reference index, so you embed once and query many times.
124
+
125
+ ## Scaling up: semantic + LLM matching
126
+
127
+ The lexical default is fast and free, but it only sees *characters* — it can't tell
128
+ that *Google* and *Alphabet* are the same company. Swap in the heavy adapters for
129
+ **meaning** (semantic embeddings), **scale** (FAISS), and **judgment** (an LLM), all
130
+ behind the same ports:
131
+
132
+ | Stage | Lexical (default) | Semantic + LLM |
133
+ |------:|-------------------|----------------|
134
+ | Embed | `HashedNGramEmbedder` | `SentenceTransformerEmbedder` · `[sentence-transformers]` |
135
+ | Index | `NumpyFlatIndex` | `FaissFlatIndex` · `[faiss]` |
136
+ | Match | `ThresholdMatcher` | `LangChainMatcher` · `[langchain]` |
137
+ | Catches | typos, abbreviations | + semantic renames, + judgment calls |
138
+
139
+ ```python
140
+ from denselinkage import DenseLinker
141
+ from denselinkage.blocking import DenseBlocker
142
+ from denselinkage.embedding import SentenceTransformerEmbedder
143
+ from denselinkage.indexing import FaissFlatIndex
144
+ from denselinkage.matching import LangChainMatcher
145
+ from langchain_openai import ChatOpenAI
146
+
147
+ linker = DenseLinker(
148
+ blocker=DenseBlocker(
149
+ embedder=SentenceTransformerEmbedder("all-MiniLM-L6-v2"),
150
+ vector_index=FaissFlatIndex(),
151
+ top_k=5, similarity_threshold=0.6,
152
+ ),
153
+ matcher=LangChainMatcher(
154
+ llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
155
+ prompt="Are these the same entity?\nA: {record_a}\nB: {record_b}",
156
+ ),
157
+ )
158
+ result = linker.link(left, right) # the call is unchanged: pass the same `Source` objects as in the Quickstart
159
+ ```
160
+
161
+ Because the score is cosine on both stacks, a `similarity_threshold` tuned on the
162
+ lexical stack keeps its meaning here. See the
163
+ [Semantic + LLM guide](https://caalvaro.github.io/denselinkage/guide/semantic-llm.html)
164
+ for model selection, the prompt contract, retries, and cost.
165
+
166
+ ## Deduplicate and cluster
167
+
168
+ ```python
169
+ from denselinkage import DenseLinker, Source, connected_components
170
+
171
+ # df: one table that may contain duplicate records, with an "id" column
172
+ result = DenseLinker.with_defaults().dedupe(Source(df, id_column="id"))
173
+ clusters = connected_components(result) # transitive grouping → entities
174
+ print(clusters.to_frame()) # record_id, cluster_id
175
+ ```
176
+
177
+ `dedupe` links a dataset against itself and suppresses self-pairs internally.
178
+ Clustering is transitive (A~B, B~C ⇒ one cluster), so a noisy matcher can
179
+ over-merge — watch for B³ recall ≫ precision.
180
+
181
+ ## Evaluation
182
+
183
+ Metrics are first-class, split by what they measure:
184
+
185
+ - **Linkage** — `linkage_metrics` → precision / recall / F1 over matched pairs
186
+ (undecidable pairs are surfaced as errors and counted separately, never mixed in).
187
+ - **Blocking** — `blocking_metrics` / `pair_completeness_at_k` → did blocking even
188
+ surface the true pairs?
189
+ - **Clustering** — `clustering_metrics` → B³ (Bagga–Baldwin) precision / recall / F1
190
+ over the entity clusters.
191
+
192
+ Plus `tune_threshold` for a P/R/F1 sweep and `mine_hard_negatives` for contrastive
193
+ training material.
194
+
195
+ ## Design
196
+
197
+ denselinkage is **contract-first** (hexagonal / ports-and-adapters). Domain logic
198
+ talks to small `typing.Protocol`s — `Embedder`, `VectorIndex`, `Matcher`, … — and
199
+ concrete adapters plug in behind them. Two consequences worth knowing:
200
+
201
+ - **The dependency cut is structural.** Heavy backends import lazily, inside the
202
+ methods that use them; a CI job asserts `import denselinkage` pulls in no FAISS /
203
+ torch / LangChain.
204
+ - **The 1.0 contract is frozen.** Signatures and field types won't change under
205
+ you; the library evolves by *adding* (an optional field, a sibling type, a new
206
+ classmethod), never by modifying. Stateful components follow **spec → artifact**:
207
+ a stateless spec's `build(...)` returns an immutable, fitted artifact.
208
+
209
+ See the
210
+ [architecture overview](https://caalvaro.github.io/denselinkage/architecture.html)
211
+ for the full picture.
212
+
213
+ ## Documentation
214
+
215
+ 📖 **[Full documentation →](https://caalvaro.github.io/denselinkage/)**
216
+
217
+ - [Tutorial](https://caalvaro.github.io/denselinkage/getting-started/tutorial.html)
218
+ — link two tables stage by stage.
219
+ - [Semantic + LLM matching](https://caalvaro.github.io/denselinkage/guide/semantic-llm.html)
220
+ and [Choosing components](https://caalvaro.github.io/denselinkage/guide/choosing-components.html).
221
+ - [API reference](https://caalvaro.github.io/denselinkage/api/index.html).
222
+
223
+ Runnable scripts live in [`examples/`](examples/) — `00_quickstart.py` is the
224
+ shortest path; `01`/`02` show the full semantic + LLM assembly.
225
+
226
+ ## Development
227
+
228
+ Requires [uv](https://docs.astral.sh/uv/).
229
+
230
+ ```bash
231
+ uv sync --dev
232
+ uv run ruff check . && uv run ruff format --check . && uv run mypy && uv run pytest
233
+ ```
234
+
235
+ CI runs lint, format, strict mypy, and the test suite on Python 3.10–3.13, with a
236
+ separate job for the optional adapters. See [CONTRIBUTING.md](CONTRIBUTING.md).
237
+
238
+ ## Changelog
239
+
240
+ See [CHANGELOG.md](CHANGELOG.md).
241
+
242
+ ## Citing
243
+
244
+ If you use denselinkage in your research, please cite it — see
245
+ [`CITATION.cff`](CITATION.cff).
246
+
247
+ ## License
248
+
249
+ [MIT](LICENSE) © 2026 Alvaro