denselinkage 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- denselinkage-1.0.0/LICENSE +21 -0
- denselinkage-1.0.0/PKG-INFO +301 -0
- denselinkage-1.0.0/README.md +249 -0
- denselinkage-1.0.0/pyproject.toml +135 -0
- denselinkage-1.0.0/src/denselinkage/__init__.py +47 -0
- denselinkage-1.0.0/src/denselinkage/_optional/__init__.py +21 -0
- denselinkage-1.0.0/src/denselinkage/_optional/require.py +29 -0
- denselinkage-1.0.0/src/denselinkage/_reader/__init__.py +25 -0
- denselinkage-1.0.0/src/denselinkage/_reader/record_reader.py +37 -0
- denselinkage-1.0.0/src/denselinkage/_store/__init__.py +12 -0
- denselinkage-1.0.0/src/denselinkage/_store/reference_store.py +134 -0
- denselinkage-1.0.0/src/denselinkage/blocking/__init__.py +14 -0
- denselinkage-1.0.0/src/denselinkage/blocking/dense_blocker.py +43 -0
- denselinkage-1.0.0/src/denselinkage/blocking/dense_blocking_index.py +88 -0
- denselinkage-1.0.0/src/denselinkage/clustering/__init__.py +19 -0
- denselinkage-1.0.0/src/denselinkage/clustering/_union_find.py +57 -0
- denselinkage-1.0.0/src/denselinkage/clustering/connected_components.py +48 -0
- denselinkage-1.0.0/src/denselinkage/clustering/connected_components_clusterer.py +13 -0
- denselinkage-1.0.0/src/denselinkage/core/__init__.py +74 -0
- denselinkage-1.0.0/src/denselinkage/core/errors.py +60 -0
- denselinkage-1.0.0/src/denselinkage/core/models.py +80 -0
- denselinkage-1.0.0/src/denselinkage/core/ports.py +202 -0
- denselinkage-1.0.0/src/denselinkage/core/results.py +170 -0
- denselinkage-1.0.0/src/denselinkage/embedding/__init__.py +13 -0
- denselinkage-1.0.0/src/denselinkage/embedding/hashed_ngram_embedder.py +54 -0
- denselinkage-1.0.0/src/denselinkage/embedding/sentence_transformer_embedder.py +61 -0
- denselinkage-1.0.0/src/denselinkage/filtering/__init__.py +14 -0
- denselinkage-1.0.0/src/denselinkage/filtering/similarity_threshold_filter.py +29 -0
- denselinkage-1.0.0/src/denselinkage/indexing/__init__.py +22 -0
- denselinkage-1.0.0/src/denselinkage/indexing/faiss_flat_index.py +34 -0
- denselinkage-1.0.0/src/denselinkage/indexing/faiss_searchable_index.py +84 -0
- denselinkage-1.0.0/src/denselinkage/indexing/numpy_flat_index.py +14 -0
- denselinkage-1.0.0/src/denselinkage/indexing/numpy_searchable_index.py +60 -0
- denselinkage-1.0.0/src/denselinkage/linkage/__init__.py +18 -0
- denselinkage-1.0.0/src/denselinkage/linkage/_assembly.py +39 -0
- denselinkage-1.0.0/src/denselinkage/linkage/candidate_frame.py +67 -0
- denselinkage-1.0.0/src/denselinkage/linkage/dense_linker.py +167 -0
- denselinkage-1.0.0/src/denselinkage/linkage/linkage_index.py +79 -0
- denselinkage-1.0.0/src/denselinkage/matching/__init__.py +13 -0
- denselinkage-1.0.0/src/denselinkage/matching/langchain_matcher.py +104 -0
- denselinkage-1.0.0/src/denselinkage/matching/retry_policy.py +9 -0
- denselinkage-1.0.0/src/denselinkage/matching/threshold_matcher.py +29 -0
- denselinkage-1.0.0/src/denselinkage/metrics/__init__.py +32 -0
- denselinkage-1.0.0/src/denselinkage/metrics/_pairing.py +11 -0
- denselinkage-1.0.0/src/denselinkage/metrics/adjusted.py +73 -0
- denselinkage-1.0.0/src/denselinkage/metrics/blocking.py +89 -0
- denselinkage-1.0.0/src/denselinkage/metrics/clustering.py +109 -0
- denselinkage-1.0.0/src/denselinkage/metrics/linkage.py +95 -0
- denselinkage-1.0.0/src/denselinkage/metrics/tuning.py +88 -0
- denselinkage-1.0.0/src/denselinkage/mining/__init__.py +9 -0
- denselinkage-1.0.0/src/denselinkage/mining/hard_negatives.py +43 -0
- denselinkage-1.0.0/src/denselinkage/py.typed +0 -0
- denselinkage-1.0.0/src/denselinkage/serializing/__init__.py +23 -0
- denselinkage-1.0.0/src/denselinkage/serializing/fieldwise_serializer.py +15 -0
- denselinkage-1.0.0/src/denselinkage/serializing/template_serializer.py +31 -0
- denselinkage-1.0.0/src/denselinkage/serializing/whole_row_serializer.py +20 -0
- denselinkage-1.0.0/src/denselinkage/training/__init__.py +11 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Alvaro
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: denselinkage
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Record linkage with dense blocking using text embeddings and LLM matching
|
|
5
|
+
Keywords: record-linkage,entity-resolution,embeddings,blocking,llm
|
|
6
|
+
Author: Alvaro
|
|
7
|
+
Author-email: Alvaro <alvarocarvalho@live.com>
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering
|
|
17
|
+
Classifier: Typing :: Typed
|
|
18
|
+
Requires-Dist: numpy>=1.24
|
|
19
|
+
Requires-Dist: pandas>=2.0
|
|
20
|
+
Requires-Dist: langchain-core>=0.3 ; extra == 'all'
|
|
21
|
+
Requires-Dist: langchain-openai>=0.2 ; extra == 'all'
|
|
22
|
+
Requires-Dist: faiss-cpu>=1.8 ; extra == 'all'
|
|
23
|
+
Requires-Dist: sentence-transformers>=3.0 ; extra == 'all'
|
|
24
|
+
Requires-Dist: mypy>=1.11 ; extra == 'dev'
|
|
25
|
+
Requires-Dist: ruff>=0.6 ; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest>=8 ; extra == 'dev'
|
|
27
|
+
Requires-Dist: pytest-cov>=5 ; extra == 'dev'
|
|
28
|
+
Requires-Dist: pandas-stubs>=2.0 ; extra == 'dev'
|
|
29
|
+
Requires-Dist: sphinx>=8 ; extra == 'docs'
|
|
30
|
+
Requires-Dist: furo>=2024.8 ; extra == 'docs'
|
|
31
|
+
Requires-Dist: myst-parser>=4 ; extra == 'docs'
|
|
32
|
+
Requires-Dist: sphinx-copybutton>=0.5 ; extra == 'docs'
|
|
33
|
+
Requires-Dist: sphinx-design>=0.6 ; extra == 'docs'
|
|
34
|
+
Requires-Dist: sphinxcontrib-mermaid>=0.9 ; extra == 'docs'
|
|
35
|
+
Requires-Dist: faiss-cpu>=1.8 ; extra == 'faiss'
|
|
36
|
+
Requires-Dist: langchain-core>=0.3 ; extra == 'langchain'
|
|
37
|
+
Requires-Dist: langchain-openai>=0.2 ; extra == 'langchain'
|
|
38
|
+
Requires-Dist: sentence-transformers>=3.0 ; extra == 'sentence-transformers'
|
|
39
|
+
Requires-Python: >=3.10
|
|
40
|
+
Project-URL: Homepage, https://github.com/caalvaro/denselinkage
|
|
41
|
+
Project-URL: Documentation, https://caalvaro.github.io/denselinkage/
|
|
42
|
+
Project-URL: Repository, https://github.com/caalvaro/denselinkage
|
|
43
|
+
Project-URL: Issues, https://github.com/caalvaro/denselinkage/issues
|
|
44
|
+
Provides-Extra: all
|
|
45
|
+
Provides-Extra: dev
|
|
46
|
+
Provides-Extra: docs
|
|
47
|
+
Provides-Extra: faiss
|
|
48
|
+
Provides-Extra: langchain
|
|
49
|
+
Provides-Extra: sentence-transformers
|
|
50
|
+
Provides-Extra: train
|
|
51
|
+
Description-Content-Type: text/markdown
|
|
52
|
+
|
|
53
|
+
# denselinkage
|
|
54
|
+
|
|
55
|
+
[CI](https://github.com/caalvaro/denselinkage/actions/workflows/ci.yml)
|
|
56
|
+
[PyPI version](https://pypi.org/project/denselinkage/)
|
|
57
|
+
[Python versions](https://pypi.org/project/denselinkage/)
|
|
58
|
+
[Documentation](https://caalvaro.github.io/denselinkage/)
|
|
59
|
+
[License: MIT](LICENSE)
|
|
60
|
+
|
|
61
|
+
**Record linkage and deduplication for Python — dense blocking, optional LLM matching, and evaluation built in.**
|
|
62
|
+
|
|
63
|
+
`denselinkage` finds the records that refer to the same real-world entity, whether
|
|
64
|
+
they live in **two datasets** (record *linkage*) or **one** (*deduplication*). It
|
|
65
|
+
shrinks the impossible all-pairs comparison down to a plausible few with
|
|
66
|
+
embedding-based **blocking**, decides each candidate with a pluggable **matcher** —
|
|
67
|
+
a fast similarity threshold or a large language model — then clusters and scores
|
|
68
|
+
the result.
|
|
69
|
+
|
|
70
|
+
The core runs on **numpy + pandas alone**. FAISS, sentence-transformers, and
|
|
71
|
+
LangChain are optional extras you reach for when you need approximate-nearest-
|
|
72
|
+
neighbour search at scale, semantic embeddings, or LLM-based matching — `import
|
|
73
|
+
denselinkage` pulls in none of them until you ask.
|
|
74
|
+
|
|
75
|
+
## Highlights
|
|
76
|
+
|
|
77
|
+
- 🪶 **Lightweight core** — `pip install denselinkage` is just numpy + pandas.
|
|
78
|
+
The heavy ML backends are opt-in extras, and the import graph proves it: CI fails
|
|
79
|
+
if a backend ever leaks into the core.
|
|
80
|
+
- 🔁 **Swap any stage** — the embedder, vector index, and matcher are independent
|
|
81
|
+
components behind small `Protocol`s. Go from lexical → semantic, brute-force →
|
|
82
|
+
FAISS, threshold → LLM without rewriting your pipeline.
|
|
83
|
+
- 📦 **End to end** — block → match → cluster → evaluate, with linkage, blocking,
|
|
84
|
+
and clustering (B³) **metrics included**.
|
|
85
|
+
- 🧊 **Immutable by design** — `link` / `dedupe` / `match_pairs` are single calls
|
|
86
|
+
with no hidden `fit`/`predict` state. Build a reference index once and reuse it.
|
|
87
|
+
- 🧪 **Typed, tested, stable** — strict `mypy`, a shipped `py.typed` marker,
|
|
88
|
+
**100% branch coverage**, and a frozen 1.0 API (evolution is *extend, never
|
|
89
|
+
modify*).
|
|
90
|
+
|
|
91
|
+
## Installation
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
pip install denselinkage # core — numpy + pandas only
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Add extras when you need them (or `[all]` for everything):
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
pip install "denselinkage[sentence-transformers]" # semantic embeddings
|
|
101
|
+
pip install "denselinkage[faiss]" # FAISS approximate-NN index
|
|
102
|
+
pip install "denselinkage[langchain]" # LLM matcher
|
|
103
|
+
pip install "denselinkage[all]"
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Requires Python 3.10+.
|
|
107
|
+
|
|
108
|
+
## Quickstart
|
|
109
|
+
|
|
110
|
+
Link two tables of companies with messy, inconsistent names — no configuration,
|
|
111
|
+
one call:
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
import pandas as pd
|
|
115
|
+
from denselinkage import DenseLinker, LabeledPairs, Source
|
|
116
|
+
from denselinkage.metrics import linkage_metrics
|
|
117
|
+
|
|
118
|
+
left = pd.DataFrame({
|
|
119
|
+
"id": ["A1", "A2", "A3"],
|
|
120
|
+
"name": ["Apple Inc", "Microsoft Corp", "Google LLC"],
|
|
121
|
+
"city": ["Cupertino", "Redmond", "Mountain View"],
|
|
122
|
+
})
|
|
123
|
+
right = pd.DataFrame({
|
|
124
|
+
"id": ["B1", "B2", "B3"],
|
|
125
|
+
"name": ["Apple Incorporated", "Microsoft", "Google"],
|
|
126
|
+
"city": ["Cupertino", "Redmond", "Mountain View"],
|
|
127
|
+
})
|
|
128
|
+
|
|
129
|
+
linker = DenseLinker.with_defaults() # lexical stack: embed → index → threshold
|
|
130
|
+
result = linker.link( # one call — no fit/predict, no mutation
|
|
131
|
+
Source(left, id_column="id"),
|
|
132
|
+
Source(right, id_column="id"),
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
print(result.to_frame().query("match")) # the decided matches, as a DataFrame
|
|
136
|
+
gold = LabeledPairs.from_pairs([("A1", "B1"), ("A2", "B2"), ("A3", "B3")])
|
|
137
|
+
m = linkage_metrics(result, gold=gold)
|
|
138
|
+
print(f"precision={m.precision:.2f} recall={m.recall:.2f} f1={m.f1:.2f}")
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
```text
|
|
142
|
+
left_id right_id similarity match confidence reason
|
|
143
|
+
0 A1 B1 0.762443 True None None
|
|
144
|
+
3 A2 B2 0.833908 True None None
|
|
145
|
+
6 A3 B3 0.864126 True None None
|
|
146
|
+
precision=1.00 recall=1.00 f1=1.00
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
`with_defaults()` wires the dependency-free **lexical** stack — character n-gram
|
|
150
|
+
embeddings, brute-force nearest-neighbour search, and a similarity threshold. It
|
|
151
|
+
recovers abbreviations, punctuation, and typos (`Apple Inc` ↔ `Apple Incorporated`)
|
|
152
|
+
out of the box.
|
|
153
|
+
|
|
154
|
+
## How it works
|
|
155
|
+
|
|
156
|
+
denselinkage is a four-stage pipeline, and every stage is a swappable component:
|
|
157
|
+
|
|
158
|
+
```text
|
|
159
|
+
Sources ──► Block ──────► Match ──────► Cluster ──────► Evaluate
|
|
160
|
+
(embed + (threshold (connected (P/R/F1,
|
|
161
|
+
top-k NN) or LLM) components) B³, …)
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
1. **Block** — embed each record and retrieve its top-k nearest neighbours, turning
|
|
165
|
+
an `N × M` comparison into a handful of candidate pairs.
|
|
166
|
+
2. **Match** — decide each candidate. `ThresholdMatcher` gates on similarity;
|
|
167
|
+
`LangChainMatcher` asks an LLM and returns a typed decision.
|
|
168
|
+
3. **Cluster** — group the matches into entities with transitive
|
|
169
|
+
`connected_components`.
|
|
170
|
+
4. **Evaluate** — score against gold labels with linkage, blocking, or clustering
|
|
171
|
+
(B³) metrics.
|
|
172
|
+
|
|
173
|
+
Three verbs cover the common shapes — **`link`** (two datasets), **`dedupe`** (one
|
|
174
|
+
dataset against itself), and **`match_pairs`** (you already have candidate pairs).
|
|
175
|
+
`index()` builds a reusable reference index, so you embed once and query many times.
|
|
176
|
+
|
|
177
|
+
## Scaling up: semantic + LLM matching
|
|
178
|
+
|
|
179
|
+
The lexical default is fast and free, but it only sees *characters* — it can't tell
|
|
180
|
+
that *Google* and *Alphabet* are the same company. Swap in the heavy adapters for
|
|
181
|
+
**meaning** (semantic embeddings), **scale** (FAISS), and **judgment** (an LLM), all
|
|
182
|
+
behind the same ports:
|
|
183
|
+
|
|
184
|
+
| Stage | Lexical (default) | Semantic + LLM |
|
|
185
|
+
|------:|-------------------|----------------|
|
|
186
|
+
| Embed | `HashedNGramEmbedder` | `SentenceTransformerEmbedder` · `[sentence-transformers]` |
|
|
187
|
+
| Index | `NumpyFlatIndex` | `FaissFlatIndex` · `[faiss]` |
|
|
188
|
+
| Match | `ThresholdMatcher` | `LangChainMatcher` · `[langchain]` |
|
|
189
|
+
| Catches | typos, abbreviations | + semantic renames, + judgment calls |
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from denselinkage import DenseLinker
|
|
193
|
+
from denselinkage.blocking import DenseBlocker
|
|
194
|
+
from denselinkage.embedding import SentenceTransformerEmbedder
|
|
195
|
+
from denselinkage.indexing import FaissFlatIndex
|
|
196
|
+
from denselinkage.matching import LangChainMatcher
|
|
197
|
+
from langchain_openai import ChatOpenAI
|
|
198
|
+
|
|
199
|
+
linker = DenseLinker(
|
|
200
|
+
blocker=DenseBlocker(
|
|
201
|
+
embedder=SentenceTransformerEmbedder("all-MiniLM-L6-v2"),
|
|
202
|
+
vector_index=FaissFlatIndex(),
|
|
203
|
+
top_k=5, similarity_threshold=0.6,
|
|
204
|
+
),
|
|
205
|
+
matcher=LangChainMatcher(
|
|
206
|
+
llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
|
|
207
|
+
prompt="Are these the same entity?\nA: {record_a}\nB: {record_b}",
|
|
208
|
+
),
|
|
209
|
+
)
|
|
210
|
+
result = linker.link(left, right) # same call as the Quickstart — left/right are the `Source(...)`-wrapped tables
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
Because the score is cosine on both stacks, a `similarity_threshold` tuned on the
|
|
214
|
+
lexical stack keeps its meaning here. See the
|
|
215
|
+
[Semantic + LLM guide](https://caalvaro.github.io/denselinkage/guide/semantic-llm.html)
|
|
216
|
+
for model selection, the prompt contract, retries, and cost.
|
|
217
|
+
|
|
218
|
+
## Deduplicate and cluster
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
from denselinkage import DenseLinker, Source, connected_components
|
|
222
|
+
|
|
223
|
+
# df: one table that may contain duplicate records, with an "id" column
|
|
224
|
+
result = DenseLinker.with_defaults().dedupe(Source(df, id_column="id"))
|
|
225
|
+
clusters = connected_components(result) # transitive grouping → entities
|
|
226
|
+
print(clusters.to_frame()) # record_id, cluster_id
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
`dedupe` links a dataset against itself and suppresses self-pairs internally.
|
|
230
|
+
Clustering is transitive (A~B, B~C ⇒ one cluster), so a noisy matcher can
|
|
231
|
+
over-merge — watch for B³ recall ≫ precision.
|
|
232
|
+
|
|
233
|
+
## Evaluation
|
|
234
|
+
|
|
235
|
+
Metrics are first-class, split by what they measure:
|
|
236
|
+
|
|
237
|
+
- **Linkage** — `linkage_metrics` → precision / recall / F1 over matched pairs
|
|
238
|
+
(undecidable pairs are surfaced as errors and counted separately, never mixed in).
|
|
239
|
+
- **Blocking** — `blocking_metrics` / `pair_completeness_at_k` → did blocking even
|
|
240
|
+
surface the true pairs?
|
|
241
|
+
- **Clustering** — `clustering_metrics` → B³ (Bagga–Baldwin) precision / recall / F1
|
|
242
|
+
over the entity clusters.
|
|
243
|
+
|
|
244
|
+
Plus `tune_threshold` for a P/R/F1 sweep and `mine_hard_negatives` for contrastive
|
|
245
|
+
training material.
|
|
246
|
+
|
|
247
|
+
## Design
|
|
248
|
+
|
|
249
|
+
denselinkage is **contract-first** (hexagonal / ports-and-adapters). Domain logic
|
|
250
|
+
talks to small `typing.Protocol`s — `Embedder`, `VectorIndex`, `Matcher`, … — and
|
|
251
|
+
concrete adapters plug in behind them. Two consequences worth knowing:
|
|
252
|
+
|
|
253
|
+
- **The dependency cut is structural.** Heavy backends import lazily, inside the
|
|
254
|
+
methods that use them; a CI job asserts `import denselinkage` pulls in no FAISS /
|
|
255
|
+
torch / LangChain.
|
|
256
|
+
- **The 1.0 contract is frozen.** Signatures and field types won't change under
|
|
257
|
+
you; the library evolves by *adding* (an optional field, a sibling type, a new
|
|
258
|
+
classmethod), never by modifying. Stateful components follow **spec → artifact**:
|
|
259
|
+
a stateless spec's `build(...)` returns an immutable, fitted artifact.
|
|
260
|
+
|
|
261
|
+
See the
|
|
262
|
+
[architecture overview](https://caalvaro.github.io/denselinkage/architecture.html)
|
|
263
|
+
for the full picture.
|
|
264
|
+
|
|
265
|
+
## Documentation
|
|
266
|
+
|
|
267
|
+
📖 **[Full documentation →](https://caalvaro.github.io/denselinkage/)**
|
|
268
|
+
|
|
269
|
+
- [Tutorial](https://caalvaro.github.io/denselinkage/getting-started/tutorial.html)
|
|
270
|
+
— link two tables stage by stage.
|
|
271
|
+
- [Semantic + LLM matching](https://caalvaro.github.io/denselinkage/guide/semantic-llm.html)
|
|
272
|
+
and [Choosing components](https://caalvaro.github.io/denselinkage/guide/choosing-components.html).
|
|
273
|
+
- [API reference](https://caalvaro.github.io/denselinkage/api/index.html).
|
|
274
|
+
|
|
275
|
+
Runnable scripts live in [`examples/`](examples/) — `00_quickstart.py` is the
|
|
276
|
+
shortest path; `01`/`02` show the full semantic + LLM assembly.
|
|
277
|
+
|
|
278
|
+
## Development
|
|
279
|
+
|
|
280
|
+
Requires [uv](https://docs.astral.sh/uv/).
|
|
281
|
+
|
|
282
|
+
```bash
|
|
283
|
+
uv sync --dev
|
|
284
|
+
uv run ruff check . && uv run ruff format --check . && uv run mypy && uv run pytest
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
CI runs lint, format, strict mypy, and the test suite on Python 3.10–3.13, with a
|
|
288
|
+
separate job for the optional adapters. See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
289
|
+
|
|
290
|
+
## Changelog
|
|
291
|
+
|
|
292
|
+
See [CHANGELOG.md](CHANGELOG.md).
|
|
293
|
+
|
|
294
|
+
## Citing
|
|
295
|
+
|
|
296
|
+
If you use denselinkage in your research, please cite it — see
|
|
297
|
+
[`CITATION.cff`](CITATION.cff).
|
|
298
|
+
|
|
299
|
+
## License
|
|
300
|
+
|
|
301
|
+
[MIT](LICENSE) © 2026 Alvaro
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
# denselinkage
|
|
2
|
+
|
|
3
|
+
[CI](https://github.com/caalvaro/denselinkage/actions/workflows/ci.yml)
|
|
4
|
+
[PyPI version](https://pypi.org/project/denselinkage/)
|
|
5
|
+
[Python versions](https://pypi.org/project/denselinkage/)
|
|
6
|
+
[Documentation](https://caalvaro.github.io/denselinkage/)
|
|
7
|
+
[License: MIT](LICENSE)
|
|
8
|
+
|
|
9
|
+
**Record linkage and deduplication for Python — dense blocking, optional LLM matching, and evaluation built in.**
|
|
10
|
+
|
|
11
|
+
`denselinkage` finds the records that refer to the same real-world entity, whether
|
|
12
|
+
they live in **two datasets** (record *linkage*) or **one** (*deduplication*). It
|
|
13
|
+
shrinks the impossible all-pairs comparison down to a plausible few with
|
|
14
|
+
embedding-based **blocking**, decides each candidate with a pluggable **matcher** —
|
|
15
|
+
a fast similarity threshold or a large language model — then clusters and scores
|
|
16
|
+
the result.
|
|
17
|
+
|
|
18
|
+
The core runs on **numpy + pandas alone**. FAISS, sentence-transformers, and
|
|
19
|
+
LangChain are optional extras you reach for when you need approximate-nearest-
|
|
20
|
+
neighbour search at scale, semantic embeddings, or LLM-based matching — `import
|
|
21
|
+
denselinkage` pulls in none of them until you ask.
|
|
22
|
+
|
|
23
|
+
## Highlights
|
|
24
|
+
|
|
25
|
+
- 🪶 **Lightweight core** — `pip install denselinkage` is just numpy + pandas.
|
|
26
|
+
The heavy ML backends are opt-in extras, and the import graph proves it: CI fails
|
|
27
|
+
if a backend ever leaks into the core.
|
|
28
|
+
- 🔁 **Swap any stage** — the embedder, vector index, and matcher are independent
|
|
29
|
+
components behind small `Protocol`s. Go from lexical → semantic, brute-force →
|
|
30
|
+
FAISS, threshold → LLM without rewriting your pipeline.
|
|
31
|
+
- 📦 **End to end** — block → match → cluster → evaluate, with linkage, blocking,
|
|
32
|
+
and clustering (B³) **metrics included**.
|
|
33
|
+
- 🧊 **Immutable by design** — `link` / `dedupe` / `match_pairs` are single calls
|
|
34
|
+
with no hidden `fit`/`predict` state. Build a reference index once and reuse it.
|
|
35
|
+
- 🧪 **Typed, tested, stable** — strict `mypy`, a shipped `py.typed` marker,
|
|
36
|
+
**100% branch coverage**, and a frozen 1.0 API (evolution is *extend, never
|
|
37
|
+
modify*).
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install denselinkage # core — numpy + pandas only
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Add extras when you need them (or `[all]` for everything):
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install "denselinkage[sentence-transformers]" # semantic embeddings
|
|
49
|
+
pip install "denselinkage[faiss]" # FAISS approximate-NN index
|
|
50
|
+
pip install "denselinkage[langchain]" # LLM matcher
|
|
51
|
+
pip install "denselinkage[all]"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Requires Python 3.10+.
|
|
55
|
+
|
|
56
|
+
## Quickstart
|
|
57
|
+
|
|
58
|
+
Link two tables of companies with messy, inconsistent names — no configuration,
|
|
59
|
+
one call:
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
import pandas as pd
|
|
63
|
+
from denselinkage import DenseLinker, LabeledPairs, Source
|
|
64
|
+
from denselinkage.metrics import linkage_metrics
|
|
65
|
+
|
|
66
|
+
left = pd.DataFrame({
|
|
67
|
+
"id": ["A1", "A2", "A3"],
|
|
68
|
+
"name": ["Apple Inc", "Microsoft Corp", "Google LLC"],
|
|
69
|
+
"city": ["Cupertino", "Redmond", "Mountain View"],
|
|
70
|
+
})
|
|
71
|
+
right = pd.DataFrame({
|
|
72
|
+
"id": ["B1", "B2", "B3"],
|
|
73
|
+
"name": ["Apple Incorporated", "Microsoft", "Google"],
|
|
74
|
+
"city": ["Cupertino", "Redmond", "Mountain View"],
|
|
75
|
+
})
|
|
76
|
+
|
|
77
|
+
linker = DenseLinker.with_defaults() # lexical stack: embed → index → threshold
|
|
78
|
+
result = linker.link( # one call — no fit/predict, no mutation
|
|
79
|
+
Source(left, id_column="id"),
|
|
80
|
+
Source(right, id_column="id"),
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
print(result.to_frame().query("match")) # the decided matches, as a DataFrame
|
|
84
|
+
gold = LabeledPairs.from_pairs([("A1", "B1"), ("A2", "B2"), ("A3", "B3")])
|
|
85
|
+
m = linkage_metrics(result, gold=gold)
|
|
86
|
+
print(f"precision={m.precision:.2f} recall={m.recall:.2f} f1={m.f1:.2f}")
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
```text
|
|
90
|
+
left_id right_id similarity match confidence reason
|
|
91
|
+
0 A1 B1 0.762443 True None None
|
|
92
|
+
3 A2 B2 0.833908 True None None
|
|
93
|
+
6 A3 B3 0.864126 True None None
|
|
94
|
+
precision=1.00 recall=1.00 f1=1.00
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
`with_defaults()` wires the dependency-free **lexical** stack — character n-gram
|
|
98
|
+
embeddings, brute-force nearest-neighbour search, and a similarity threshold. It
|
|
99
|
+
recovers abbreviations, punctuation, and typos (`Apple Inc` ↔ `Apple Incorporated`)
|
|
100
|
+
out of the box.
|
|
101
|
+
|
|
102
|
+
## How it works
|
|
103
|
+
|
|
104
|
+
denselinkage is a four-stage pipeline, and every stage is a swappable component:
|
|
105
|
+
|
|
106
|
+
```text
|
|
107
|
+
Sources ──► Block ──────► Match ──────► Cluster ──────► Evaluate
|
|
108
|
+
(embed + (threshold (connected (P/R/F1,
|
|
109
|
+
top-k NN) or LLM) components) B³, …)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
1. **Block** — embed each record and retrieve its top-k nearest neighbours, turning
|
|
113
|
+
an `N × M` comparison into a handful of candidate pairs.
|
|
114
|
+
2. **Match** — decide each candidate. `ThresholdMatcher` gates on similarity;
|
|
115
|
+
`LangChainMatcher` asks an LLM and returns a typed decision.
|
|
116
|
+
3. **Cluster** — group the matches into entities with transitive
|
|
117
|
+
`connected_components`.
|
|
118
|
+
4. **Evaluate** — score against gold labels with linkage, blocking, or clustering
|
|
119
|
+
(B³) metrics.
|
|
120
|
+
|
|
121
|
+
Three verbs cover the common shapes — **`link`** (two datasets), **`dedupe`** (one
|
|
122
|
+
dataset against itself), and **`match_pairs`** (you already have candidate pairs).
|
|
123
|
+
`index()` builds a reusable reference index, so you embed once and query many times.
|
|
124
|
+
|
|
125
|
+
## Scaling up: semantic + LLM matching
|
|
126
|
+
|
|
127
|
+
The lexical default is fast and free, but it only sees *characters* — it can't tell
|
|
128
|
+
that *Google* and *Alphabet* are the same company. Swap in the heavy adapters for
|
|
129
|
+
**meaning** (semantic embeddings), **scale** (FAISS), and **judgment** (an LLM), all
|
|
130
|
+
behind the same ports:
|
|
131
|
+
|
|
132
|
+
| Stage | Lexical (default) | Semantic + LLM |
|
|
133
|
+
|------:|-------------------|----------------|
|
|
134
|
+
| Embed | `HashedNGramEmbedder` | `SentenceTransformerEmbedder` · `[sentence-transformers]` |
|
|
135
|
+
| Index | `NumpyFlatIndex` | `FaissFlatIndex` · `[faiss]` |
|
|
136
|
+
| Match | `ThresholdMatcher` | `LangChainMatcher` · `[langchain]` |
|
|
137
|
+
| Catches | typos, abbreviations | + semantic renames, + judgment calls |
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from denselinkage import DenseLinker
|
|
141
|
+
from denselinkage.blocking import DenseBlocker
|
|
142
|
+
from denselinkage.embedding import SentenceTransformerEmbedder
|
|
143
|
+
from denselinkage.indexing import FaissFlatIndex
|
|
144
|
+
from denselinkage.matching import LangChainMatcher
|
|
145
|
+
from langchain_openai import ChatOpenAI
|
|
146
|
+
|
|
147
|
+
linker = DenseLinker(
|
|
148
|
+
blocker=DenseBlocker(
|
|
149
|
+
embedder=SentenceTransformerEmbedder("all-MiniLM-L6-v2"),
|
|
150
|
+
vector_index=FaissFlatIndex(),
|
|
151
|
+
top_k=5, similarity_threshold=0.6,
|
|
152
|
+
),
|
|
153
|
+
matcher=LangChainMatcher(
|
|
154
|
+
llm=ChatOpenAI(model="gpt-4o-mini", temperature=0),
|
|
155
|
+
prompt="Are these the same entity?\nA: {record_a}\nB: {record_b}",
|
|
156
|
+
),
|
|
157
|
+
)
|
|
158
|
+
result = linker.link(left, right) # same call as the Quickstart — left/right are the `Source(...)`-wrapped tables
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Because the score is cosine on both stacks, a `similarity_threshold` tuned on the
|
|
162
|
+
lexical stack keeps its meaning here. See the
|
|
163
|
+
[Semantic + LLM guide](https://caalvaro.github.io/denselinkage/guide/semantic-llm.html)
|
|
164
|
+
for model selection, the prompt contract, retries, and cost.
|
|
165
|
+
|
|
166
|
+
## Deduplicate and cluster
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
from denselinkage import DenseLinker, Source, connected_components
|
|
170
|
+
|
|
171
|
+
# df: one table that may contain duplicate records, with an "id" column
|
|
172
|
+
result = DenseLinker.with_defaults().dedupe(Source(df, id_column="id"))
|
|
173
|
+
clusters = connected_components(result) # transitive grouping → entities
|
|
174
|
+
print(clusters.to_frame()) # record_id, cluster_id
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
`dedupe` links a dataset against itself and suppresses self-pairs internally.
|
|
178
|
+
Clustering is transitive (A~B, B~C ⇒ one cluster), so a noisy matcher can
|
|
179
|
+
over-merge — watch for B³ recall ≫ precision.
|
|
180
|
+
|
|
181
|
+
## Evaluation
|
|
182
|
+
|
|
183
|
+
Metrics are first-class, split by what they measure:
|
|
184
|
+
|
|
185
|
+
- **Linkage** — `linkage_metrics` → precision / recall / F1 over matched pairs
|
|
186
|
+
(undecidable pairs are surfaced as errors and counted separately, never mixed in).
|
|
187
|
+
- **Blocking** — `blocking_metrics` / `pair_completeness_at_k` → did blocking even
|
|
188
|
+
surface the true pairs?
|
|
189
|
+
- **Clustering** — `clustering_metrics` → B³ (Bagga–Baldwin) precision / recall / F1
|
|
190
|
+
over the entity clusters.
|
|
191
|
+
|
|
192
|
+
Plus `tune_threshold` for a P/R/F1 sweep and `mine_hard_negatives` for contrastive
|
|
193
|
+
training material.
|
|
194
|
+
|
|
195
|
+
## Design
|
|
196
|
+
|
|
197
|
+
denselinkage is **contract-first** (hexagonal / ports-and-adapters). Domain logic
|
|
198
|
+
talks to small `typing.Protocol`s — `Embedder`, `VectorIndex`, `Matcher`, … — and
|
|
199
|
+
concrete adapters plug in behind them. Two consequences worth knowing:
|
|
200
|
+
|
|
201
|
+
- **The dependency cut is structural.** Heavy backends import lazily, inside the
|
|
202
|
+
methods that use them; a CI job asserts `import denselinkage` pulls in no FAISS /
|
|
203
|
+
torch / LangChain.
|
|
204
|
+
- **The 1.0 contract is frozen.** Signatures and field types won't change under
|
|
205
|
+
you; the library evolves by *adding* (an optional field, a sibling type, a new
|
|
206
|
+
classmethod), never by modifying. Stateful components follow **spec → artifact**:
|
|
207
|
+
a stateless spec's `build(...)` returns an immutable, fitted artifact.
|
|
208
|
+
|
|
209
|
+
See the
|
|
210
|
+
[architecture overview](https://caalvaro.github.io/denselinkage/architecture.html)
|
|
211
|
+
for the full picture.
|
|
212
|
+
|
|
213
|
+
## Documentation
|
|
214
|
+
|
|
215
|
+
📖 **[Full documentation →](https://caalvaro.github.io/denselinkage/)**
|
|
216
|
+
|
|
217
|
+
- [Tutorial](https://caalvaro.github.io/denselinkage/getting-started/tutorial.html)
|
|
218
|
+
— link two tables stage by stage.
|
|
219
|
+
- [Semantic + LLM matching](https://caalvaro.github.io/denselinkage/guide/semantic-llm.html)
|
|
220
|
+
and [Choosing components](https://caalvaro.github.io/denselinkage/guide/choosing-components.html).
|
|
221
|
+
- [API reference](https://caalvaro.github.io/denselinkage/api/index.html).
|
|
222
|
+
|
|
223
|
+
Runnable scripts live in [`examples/`](examples/) — `00_quickstart.py` is the
|
|
224
|
+
shortest path; `01`/`02` show the full semantic + LLM assembly.
|
|
225
|
+
|
|
226
|
+
## Development
|
|
227
|
+
|
|
228
|
+
Requires [uv](https://docs.astral.sh/uv/).
|
|
229
|
+
|
|
230
|
+
```bash
|
|
231
|
+
uv sync --dev
|
|
232
|
+
uv run ruff check . && uv run ruff format --check . && uv run mypy && uv run pytest
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
CI runs lint, format, strict mypy, and the test suite on Python 3.10–3.13, with a
|
|
236
|
+
separate job for the optional adapters. See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
237
|
+
|
|
238
|
+
## Changelog
|
|
239
|
+
|
|
240
|
+
See [CHANGELOG.md](CHANGELOG.md).
|
|
241
|
+
|
|
242
|
+
## Citing
|
|
243
|
+
|
|
244
|
+
If you use denselinkage in your research, please cite it — see
|
|
245
|
+
[`CITATION.cff`](CITATION.cff).
|
|
246
|
+
|
|
247
|
+
## License
|
|
248
|
+
|
|
249
|
+
[MIT](LICENSE) © 2026 Alvaro
|