llm-join 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_join/__init__.py +4 -0
- llm_join/config.py +52 -0
- llm_join/join.py +129 -0
- llm_join/merger.py +64 -0
- llm_join/prompts.py +17 -0
- llm_join/retriever.py +68 -0
- llm_join/scorer.py +159 -0
- llm_join-0.2.0.dist-info/METADATA +631 -0
- llm_join-0.2.0.dist-info/RECORD +10 -0
- llm_join-0.2.0.dist-info/WHEEL +4 -0
llm_join/__init__.py
ADDED
llm_join/config.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Callable, Optional
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class ColumnConfig:
    """Validated settings for one fuzzy join between two key columns.

    All checks run in ``__post_init__``; the first invalid field raises
    ``ValueError``.
    """

    left_col: str
    right_col: str
    embed_fn: Callable[[list[str]], np.ndarray]
    context: str = ""  # has a default, but empty/blank values are rejected below
    column_context: dict[str, str] = field(default_factory=dict)
    top_k: int = 5
    threshold: float = 0.7
    batch_size: int = 32
    embed_threshold: Optional[float] = None
    max_llm_calls: Optional[int] = None
    max_retries: int = 3

    def __post_init__(self):
        """Validate every field.

        Raises:
            ValueError: if a column name or the context is empty, or a numeric
                setting is outside its allowed range.
        """
        # Treat an explicit None like "no per-column notes" so context_str
        # can always do a membership test.
        if self.column_context is None:
            self.column_context = {}
        if not self.left_col:
            raise ValueError("left_col must not be empty")
        if not self.right_col:
            raise ValueError("right_col must not be empty")
        if not self.context or not self.context.strip():
            raise ValueError(
                "context must not be empty — describe what the columns represent and what kind of match to make. "
                "Example: \"pharmaceutical drug names — match generic INN names to US brand names\""
            )
        if not 0.0 < self.threshold <= 1.0:
            raise ValueError(f"threshold must be in (0, 1], got {self.threshold}")
        if self.top_k < 1:
            raise ValueError(f"top_k must be >= 1, got {self.top_k}")
        if self.batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {self.batch_size}")
        if self.embed_threshold is not None and not 0.0 < self.embed_threshold <= 1.0:
            raise ValueError(f"embed_threshold must be in (0, 1], got {self.embed_threshold}")
        if self.max_llm_calls is not None and self.max_llm_calls < 1:
            raise ValueError(f"max_llm_calls must be >= 1, got {self.max_llm_calls}")
        if self.max_retries < 0:
            raise ValueError(f"max_retries must be >= 0, got {self.max_retries}")

    @property
    def context_str(self) -> str:
        """Return the context followed by any per-column notes, joined by ". ".

        Notes are taken from ``column_context`` for ``left_col`` then
        ``right_col``. When both sides use the same column name the note is
        emitted once rather than twice.
        """
        parts = [self.context] if self.context else []
        # dict.fromkeys de-duplicates while keeping left-before-right order.
        for col in dict.fromkeys((self.left_col, self.right_col)):
            if col in self.column_context:
                parts.append(f"{col}: {self.column_context[col]}")
        return ". ".join(parts)
|
llm_join/join.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from typing import Callable, Optional, Union
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from llm_join.config import ColumnConfig
|
|
6
|
+
from llm_join.retriever import EmbeddingRetriever
|
|
7
|
+
from llm_join.scorer import LLMScorer
|
|
8
|
+
from llm_join.merger import Merger, MatchResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def fuzzy_join(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    *,
    left_on: Union[str, list[str]],
    right_on: Union[str, list[str]],
    llm: Callable,
    embed_fn: Callable,
    context: str,
    column_context: Optional[dict] = None,
    top_k: int = 5,
    threshold: float = 0.7,
    how: str = "inner",
    batch_size: int = 32,
    embed_threshold: Optional[float] = None,
    max_llm_calls: Optional[int] = None,
    max_retries: int = 3,
    return_reasoning: bool = False,
) -> pd.DataFrame:
    """Join two DataFrames on values that match in meaning rather than exactly.

    Each distinct left value is paired with its ``top_k`` nearest right values
    by embedding similarity; the LLM then scores those candidates and the
    accepted pairs are handed to ``Merger.merge``.

    Args:
        df1: Left frame.
        df2: Right frame.
        left_on: Key column of ``df1``, or a list of columns whose values are
            concatenated into one key.
        right_on: Key column(s) of ``df2``, same rules as ``left_on``.
        llm: Synchronous callable taking the prompt and returning the reply.
        embed_fn: Callable mapping a list of strings to a 2-D array.
        context: Non-empty description of what the columns hold.
        column_context: Optional per-column notes keyed by column name.
        top_k: Candidates retrieved per left value.
        threshold: Minimum LLM score for a pair to be accepted.
        how: One of "inner", "left", "right", "outer".
        batch_size: Validated and stored on the config; not otherwise read by
            this function.
        embed_threshold: When set, a top embedding score at or above it is
            accepted without an LLM call, and one below
            ``1 - embed_threshold`` is skipped with a warning.
        max_llm_calls: Cap on scored left values; once reached the remaining
            values are skipped and a warning is emitted.
        max_retries: Retries per LLM call before falling back to the top
            embedding candidate.
        return_reasoning: Forwarded to ``Merger.merge``.

    Returns:
        The joined DataFrame produced by ``Merger.merge``.

    Raises:
        ValueError: for an unknown ``how`` or any invalid config value.
    """
    # Reject a bad join type before spending embedding and LLM calls on it.
    if how not in ("inner", "left", "right", "outer"):
        raise ValueError(f"how must be inner/left/right/outer, got '{how}'")

    # Normalise column names to single string (multi-col join concatenates values)
    left_col, right_col, df1, df2 = _normalise_cols(df1, df2, left_on, right_on)

    cfg = ColumnConfig(
        left_col=left_col,
        right_col=right_col,
        embed_fn=embed_fn,
        context=context,
        column_context=column_context or {},
        top_k=top_k,
        threshold=threshold,
        batch_size=batch_size,
        embed_threshold=embed_threshold,
        max_llm_calls=max_llm_calls,
        max_retries=max_retries,
    )

    retriever = EmbeddingRetriever(embed_fn=cfg.embed_fn)
    scorer = LLMScorer(llm, max_retries=cfg.max_retries)
    merger = Merger()

    # Score each distinct value once: repeated keys would otherwise cost extra
    # LLM calls and add duplicate match rows that multiply the joined rows.
    left_vals = list(dict.fromkeys(df1[left_col].astype(str).tolist()))
    right_vals = list(dict.fromkeys(df2[right_col].astype(str).tolist()))

    if left_vals and right_vals:
        candidates_per_row = retriever.retrieve_with_scores(left_vals, right_vals, top_k=cfg.top_k)
    else:
        # Nothing to compare; do not call the embedder with an empty batch.
        candidates_per_row = [[] for _ in left_vals]

    matches: list[MatchResult] = []
    llm_call_count = 0

    for left_val, candidates_with_scores in zip(left_vals, candidates_per_row):
        if not candidates_with_scores:
            continue

        # embed_threshold short-circuit
        if cfg.embed_threshold is not None:
            best_candidate, best_score = candidates_with_scores[0]  # already sorted by score desc
            if best_score >= cfg.embed_threshold:
                matches.append(MatchResult(
                    left_val=left_val,
                    right_val=best_candidate,
                    score=best_score,
                    reasoning="skipped — embed score above threshold",
                    embed_rank=0,
                    match_method="embed_threshold",
                ))
                continue
            if best_score < (1.0 - cfg.embed_threshold):
                warnings.warn(
                    f"Row '{left_val}' skipped: top embed score {best_score:.3f} "
                    f"below non-match threshold {1.0 - cfg.embed_threshold:.3f}",
                    UserWarning,
                    stacklevel=2,
                )
                continue

        # max_llm_calls cap
        if cfg.max_llm_calls is not None and llm_call_count >= cfg.max_llm_calls:
            warnings.warn(
                f"max_llm_calls={cfg.max_llm_calls} reached. "
                "Remaining rows skipped. Result is partial.",
                UserWarning,
                stacklevel=2,
            )
            break

        candidates = [c for c, _ in candidates_with_scores]
        results = scorer.score(left_val, candidates, cfg.context_str, threshold=cfg.threshold)
        llm_call_count += 1
        if results is None:
            # LLM failed all retries — fall back to highest-scoring embed candidate
            best_candidate, best_embed_score = candidates_with_scores[0]
            matches.append(MatchResult(
                left_val=left_val,
                right_val=best_candidate,
                score=best_embed_score,
                reasoning="LLM failed — embed rank-0 fallback used",
                embed_rank=0,
                match_method="embed_fallback",
            ))
        elif results:
            matches.extend(results)

    return merger.merge(df1, df2, left_col, right_col, matches, how=how, return_reasoning=return_reasoning)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _normalise_cols(df1, df2, left_on, right_on):
|
|
117
|
+
if isinstance(left_on, list):
|
|
118
|
+
col = "__left_key__"
|
|
119
|
+
df1 = df1.copy()
|
|
120
|
+
df1[col] = df1[left_on].astype(str).agg(" ".join, axis=1)
|
|
121
|
+
left_on = col
|
|
122
|
+
if isinstance(right_on, list):
|
|
123
|
+
col = "__right_key__"
|
|
124
|
+
df2 = df2.copy()
|
|
125
|
+
df2[col] = df2[right_on].astype(str).agg(" ".join, axis=1)
|
|
126
|
+
right_on = col
|
|
127
|
+
return left_on, right_on, df1, df2
|
|
128
|
+
|
|
129
|
+
|
llm_join/merger.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class MatchResult:
    """One accepted pairing of a left value with a right value."""

    left_val: str  # left key value, as a string
    right_val: str  # matched right key value, as a string
    score: float  # LLM score, or the embedding score for non-LLM methods
    reasoning: str  # explanation text attached to the match
    embed_rank: int = 0  # position of right_val in the retrieved candidate list
    match_method: str = "llm"  # "llm", "embed_threshold", or "embed_fallback"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Merger:
    """Turn a list of matched key pairs into a joined DataFrame."""

    def merge(
        self,
        df1: pd.DataFrame,
        df2: pd.DataFrame,
        left_col: str,
        right_col: str,
        matches: "list[MatchResult]",
        how: str = "inner",
        return_reasoning: bool = False,
    ) -> pd.DataFrame:
        """Join ``df1`` and ``df2`` on the supplied matches.

        Rows are paired by comparing the string form of each key column with
        the match values, so non-string key columns work and missing keys are
        never joined to one another. Repeated matches for the same pair are
        applied once. Columns present in both frames get pandas' default
        ``_x`` / ``_y`` suffixes.

        Args:
            df1: Left frame.
            df2: Right frame.
            left_col: Key column in ``df1``.
            right_col: Key column in ``df2``.
            matches: Objects exposing ``left_val``, ``right_val``, ``score``,
                ``reasoning``, ``embed_rank`` and ``match_method``.
            how: "inner", "left", "right" or "outer".
            return_reasoning: When True, append ``_llm_score``,
                ``_llm_reasoning``, ``_embed_rank`` and ``_match_method``.

        Returns:
            A frame with ``df1``'s columns, then ``df2``'s, then the optional
            match columns, and a fresh integer index. Inner/left/outer results
            follow ``df1`` row order (unmatched right rows last); right joins
            follow ``df2`` row order.

        Raises:
            ValueError: for an unknown ``how`` or when ``right_col`` already
                exists in ``df1`` under a different role.
        """
        if how not in ("inner", "left", "right", "outer"):
            raise ValueError(f"how must be inner/left/right/outer, got '{how}'")
        if right_col in df1.columns and right_col != left_col:
            raise ValueError(
                f"right_col '{right_col}' already exists in df1; rename before merging"
            )

        pairs = self._row_pairs(df1, df2, left_col, right_col, matches, how)

        left_part = self._take(df1, [lp for lp, _, _ in pairs])
        right_part = self._take(df2, [rp for _, rp, _ in pairs])

        # Mirror pandas' default merge suffixes for names used on both sides.
        shared = set(df1.columns) & set(df2.columns)
        if shared:
            left_part = left_part.rename(columns={c: f"{c}_x" for c in shared})
            right_part = right_part.rename(columns={c: f"{c}_y" for c in shared})

        parts = [left_part, right_part]
        if return_reasoning:
            parts.append(self._meta_frame([m for _, _, m in pairs]))

        return pd.concat(parts, axis=1).reset_index(drop=True)

    @staticmethod
    def _row_pairs(df1, df2, left_col, right_col, matches, how):
        """Return ordered ``(left_pos, right_pos, match)`` triples; -1 marks a missing side."""
        left_rows = {}
        for pos, key in enumerate(df1[left_col].astype(str).tolist()):
            left_rows.setdefault(key, []).append(pos)
        right_rows = {}
        for pos, key in enumerate(df2[right_col].astype(str).tolist()):
            right_rows.setdefault(key, []).append(pos)

        pairs = []
        seen = set()
        for match in matches:
            key = (str(match.left_val), str(match.right_val))
            if key in seen:
                continue  # the same pair reported twice must not double the rows
            seen.add(key)
            for lp in left_rows.get(key[0], ()):
                for rp in right_rows.get(key[1], ()):
                    pairs.append((lp, rp, match))

        if how in ("left", "outer"):
            joined_left = {lp for lp, _, _ in pairs}
            pairs.extend((lp, -1, None) for lp in range(len(df1)) if lp not in joined_left)
        if how in ("right", "outer"):
            joined_right = {rp for _, rp, _ in pairs}
            pairs.extend((-1, rp, None) for rp in range(len(df2)) if rp not in joined_right)

        n_left, n_right = len(df1), len(df2)

        def order(pair):
            # Rows lacking a side sort after every real position on that side.
            lp, rp, _ = pair
            left_rank = lp if lp >= 0 else n_left
            right_rank = rp if rp >= 0 else n_right
            return (right_rank, left_rank) if how == "right" else (left_rank, right_rank)

        pairs.sort(key=order)
        return pairs

    @staticmethod
    def _take(df, positions):
        """Select rows by position; -1 yields an all-missing row."""
        frame = df.reset_index(drop=True)
        # -1 is never a label of the fresh RangeIndex, so reindex fills it with NaN.
        return frame.reindex(pd.Index(positions, dtype="int64")).reset_index(drop=True)

    @staticmethod
    def _meta_frame(row_matches):
        """Build the match-detail columns, with missing values for unmatched rows."""
        nan = float("nan")
        return pd.DataFrame({
            "_llm_score": [nan if m is None else m.score for m in row_matches],
            "_llm_reasoning": [None if m is None else m.reasoning for m in row_matches],
            "_embed_rank": [nan if m is None else m.embed_rank for m in row_matches],
            "_match_method": [None if m is None else m.match_method for m in row_matches],
        })
|
llm_join/prompts.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
_TEMPLATE = """\
You are a data matching assistant.
{context_line}For each pair below, score how likely LEFT matches RIGHT (0.0–1.0).
Respond ONLY as a JSON array with no extra text:
[{{"index": 0, "score": 0.95, "reasoning": "brief explanation"}}, ...]

Pairs:
{pairs}"""


def build_prompt(left_val: str, candidates: list[str], context_str: str) -> str:
    """Render the scoring prompt for one left value and its candidates.

    Each candidate becomes one numbered ``LEFT | RIGHT`` line whose number is
    the candidate's position in ``candidates``. Values are written as JSON
    string literals, so embedded quotes, backslashes and newlines are escaped
    and every pair stays on a single line; ordinary text renders exactly as
    plain double-quoted text.

    Args:
        left_val: The value being matched.
        candidates: Right-hand values to score against it.
        context_str: Optional description; omitted from the prompt when blank.

    Returns:
        The complete prompt text.
    """
    import json  # local import: used only here for safe quoting

    def quoted(value) -> str:
        return json.dumps(str(value), ensure_ascii=False)

    context_line = f"Context: {context_str}\n" if context_str.strip() else ""
    left_text = quoted(left_val)
    pairs = "\n".join(
        f"{i}. LEFT: {left_text} | RIGHT: {quoted(c)}"
        for i, c in enumerate(candidates)
    )
    return _TEMPLATE.format(context_line=context_line, pairs=pairs)
|
llm_join/retriever.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from typing import Callable
|
|
2
|
+
import faiss
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class EmbeddingRetriever:
    """Find the nearest corpus strings for each query by cosine similarity.

    Vectors come from the supplied ``embed_fn``, are L2-normalised, and are
    searched with a flat inner-product FAISS index built on every call.
    """

    def __init__(self, embed_fn: Callable[[list[str]], np.ndarray]):
        self._embed_fn = embed_fn

    def _embed(self, texts: list[str]) -> np.ndarray:
        """Embed ``texts`` into a fresh C-contiguous float32 matrix.

        A copy is always made because the result is normalised in place and
        must not alias an array the embedding function still owns.

        Raises:
            ValueError: if the result is not 2-D with one row per text.
        """
        vectors = np.array(self._embed_fn(texts), dtype="float32", order="C")
        if vectors.ndim != 2 or vectors.shape[0] != len(texts):
            raise ValueError(
                f"embed_fn must return one vector per text: expected shape "
                f"({len(texts)}, dim), got {vectors.shape}"
            )
        return vectors

    def retrieve(
        self,
        query_vals: list[str],
        corpus_vals: list[str],
        top_k: int = 5,
    ) -> list[list[str]]:
        """Return, per query, up to ``top_k`` corpus values, best match first."""
        scored = self.retrieve_with_scores(query_vals, corpus_vals, top_k=top_k)
        return [[candidate for candidate, _ in row] for row in scored]

    def retrieve_with_scores(
        self,
        query_vals: list[str],
        corpus_vals: list[str],
        top_k: int = 5,
    ) -> list[list[tuple[str, float]]]:
        """Returns list of (candidate, cosine_score) per query, sorted by score desc.

        An empty query list yields ``[]`` and an empty corpus yields one empty
        list per query; neither case calls the embedding function.

        Raises:
            ValueError: if ``top_k`` is below 1.
        """
        if top_k < 1:
            raise ValueError(f"top_k must be >= 1, got {top_k}")
        if not query_vals:
            return []
        if not corpus_vals:
            return [[] for _ in query_vals]

        top_k = min(top_k, len(corpus_vals))
        corpus_vecs = self._embed(corpus_vals)
        query_vecs = self._embed(query_vals)

        # L2-normalize for cosine similarity via inner product
        faiss.normalize_L2(corpus_vecs)
        faiss.normalize_L2(query_vecs)

        index = faiss.IndexFlatIP(corpus_vecs.shape[1])
        index.add(corpus_vecs)

        scores, indices = index.search(query_vecs, top_k)

        # Negative indices are skipped, as in the original filter.
        return [
            [(corpus_vals[idx], float(score)) for idx, score in zip(row_indices, row_scores) if idx >= 0]
            for row_indices, row_scores in zip(indices, scores)
        ]
|
llm_join/scorer.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import json
|
|
3
|
+
import re
|
|
4
|
+
import time
|
|
5
|
+
import warnings
|
|
6
|
+
from typing import Callable, Optional
|
|
7
|
+
|
|
8
|
+
from llm_join.merger import MatchResult
|
|
9
|
+
from llm_join.prompts import build_prompt
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class LLMScorer:
    """Ask an LLM to score candidate matches and parse its JSON reply."""

    _NO_JSON = object()  # sentinel: no JSON value could be recovered from a reply

    def __init__(self, llm: Callable, max_retries: int = 3):
        # inspect's check replaces asyncio.iscoroutinefunction, which is
        # deprecated in recent Python releases.
        import inspect

        self._llm = llm
        # Also recognise callable objects whose __call__ is a coroutine function.
        self._is_async = inspect.iscoroutinefunction(llm) or inspect.iscoroutinefunction(
            getattr(llm, "__call__", None)
        )
        self._max_retries = max_retries

    def score(
        self,
        left_val: str,
        candidates: list[str],
        context_str: str,
        threshold: float = 0.7,
    ) -> "Optional[list[MatchResult]]":
        """Score ``candidates`` against ``left_val`` with a synchronous LLM.

        Any exception from the call or from parsing triggers a retry after
        1s, 2s, 4s, ... up to ``max_retries`` retries.

        Returns:
            The accepted matches (possibly empty), or ``None`` when every
            attempt failed so the caller can apply its embedding fallback.

        Raises:
            TypeError: if the configured LLM is asynchronous.
        """
        if self._is_async:
            raise TypeError(
                "LLM is async; call score_async() instead, or pass a sync callable."
            )
        prompt = build_prompt(left_val, candidates, context_str)
        last_exc: Optional[Exception] = None
        for attempt in range(self._max_retries + 1):
            try:
                raw = self._llm(prompt)
                return self._parse(left_val, candidates, raw, threshold)
            except Exception as exc:
                last_exc = exc
                if attempt < self._max_retries:
                    wait = 2 ** attempt  # 1s, 2s, 4s, ...
                    warnings.warn(
                        f"LLM call failed for '{left_val}' (attempt {attempt + 1}/{self._max_retries + 1}): "
                        f"{exc!r}. Retrying in {wait}s.",
                        UserWarning,
                        stacklevel=2,
                    )
                    time.sleep(wait)
        warnings.warn(
            f"LLM call failed for '{left_val}' after {self._max_retries + 1} attempts: {last_exc!r}. "
            "Falling back to top embed candidate.",
            UserWarning,
            stacklevel=2,
        )
        return None  # signals LLM failure — caller applies embed fallback

    async def score_async(
        self,
        left_val: str,
        candidates: list[str],
        context_str: str,
        threshold: float = 0.7,
    ) -> "Optional[list[MatchResult]]":
        """Async counterpart of ``score``; accepts sync or async LLM callables.

        Returns:
            The accepted matches (possibly empty), or ``None`` when every
            attempt failed.
        """
        prompt = build_prompt(left_val, candidates, context_str)
        last_exc: Optional[Exception] = None
        for attempt in range(self._max_retries + 1):
            try:
                if self._is_async:
                    raw = await self._llm(prompt)
                else:
                    raw = self._llm(prompt)
                return self._parse(left_val, candidates, raw, threshold)
            except Exception as exc:
                last_exc = exc
                if attempt < self._max_retries:
                    wait = 2 ** attempt
                    warnings.warn(
                        f"LLM call failed for '{left_val}' (attempt {attempt + 1}/{self._max_retries + 1}): "
                        f"{exc!r}. Retrying in {wait}s.",
                        UserWarning,
                        stacklevel=2,
                    )
                    await asyncio.sleep(wait)
        warnings.warn(
            f"LLM call failed for '{left_val}' after {self._max_retries + 1} attempts: {last_exc!r}. "
            "Falling back to top embed candidate.",
            UserWarning,
            stacklevel=2,
        )
        return None  # signals LLM failure — caller applies embed fallback

    @classmethod
    def _load_array(cls, raw: str):
        """Recover the JSON payload from a reply, or return ``_NO_JSON``.

        The reply is first parsed whole after stripping markdown code fences.
        Failing that, each ``[`` is tried in turn with a real JSON decoder, so
        brackets inside string values no longer cut the array short; the first
        array whose items are all objects is returned.
        """
        cleaned = (
            raw.strip()
            .removeprefix("```json")
            .removeprefix("```")
            .removesuffix("```")
            .strip()
        )
        try:
            return json.loads(cleaned)
        except ValueError:  # json.JSONDecodeError is a ValueError
            pass

        decoder = json.JSONDecoder()
        start = raw.find("[")
        while start != -1:
            try:
                value, _ = decoder.raw_decode(raw, start)
            except ValueError:
                value = None
            if isinstance(value, list) and all(isinstance(item, dict) for item in value):
                return value
            start = raw.find("[", start + 1)
        return cls._NO_JSON

    @staticmethod
    def _as_index(value) -> Optional[int]:
        """Return ``value`` as an int index, or ``None`` if it is not integral."""
        if isinstance(value, bool):
            return None
        if isinstance(value, int):
            return value
        if isinstance(value, float) and value.is_integer():
            return int(value)
        if isinstance(value, str):
            try:
                return int(value.strip())
            except ValueError:
                return None
        return None

    def _parse(
        self,
        left_val: str,
        candidates: list[str],
        raw: str,
        threshold: float,
    ) -> "list[MatchResult]":
        """Convert the LLM reply into the best-scoring match(es).

        Unparseable replies produce a warning and an empty list. Items that
        are not objects, or whose index or score is missing or invalid, are
        skipped instead of raising. Among items at or above ``threshold``,
        every candidate sharing the highest score is returned.
        """
        parsed = self._load_array(raw)
        if parsed is self._NO_JSON:
            warnings.warn(f"LLM returned malformed JSON for '{left_val}': {raw!r}")
            return []
        if not isinstance(parsed, list):
            warnings.warn(f"LLM returned non-array JSON for '{left_val}': {raw!r}")
            return []

        # Collect all valid scored items above threshold, deduplicated by index
        seen_indices: set[int] = set()
        scored = []
        for item in parsed:
            if not isinstance(item, dict):
                continue
            idx = self._as_index(item.get("index", -1))
            if idx is None or not (0 <= idx < len(candidates)):
                continue
            if idx in seen_indices:
                continue  # LLM returned duplicate index — skip
            try:
                score = float(item.get("score", 0.0))
            except (TypeError, ValueError):
                continue  # unusable score; do not fail the whole reply
            seen_indices.add(idx)
            if score >= threshold:
                reasoning = item.get("reasoning", "")
                scored.append((score, idx, "" if reasoning is None else str(reasoning)))

        if not scored:
            return []

        # Find best score, return ALL candidates that tie at that score
        best_score = max(s for s, _, _ in scored)
        tied = [(idx, reasoning) for score, idx, reasoning in scored if score == best_score]

        # Annotate reasoning when multiple candidates tie — visible in return_reasoning output
        tie_note = (
            f" [tied: {len(tied)} candidates scored {best_score} — all joined]"
            if len(tied) > 1 else ""
        )

        return [
            MatchResult(
                left_val=left_val,
                right_val=candidates[idx],
                score=best_score,
                reasoning=reasoning + tie_note,
                embed_rank=idx,
            )
            for idx, reasoning in tied
        ]
|
|
@@ -0,0 +1,631 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-join
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Fuzzy join DataFrames using LLM scoring and embedding retrieval
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: entity-resolution,fuzzy-join,llm,nlp,pandas
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Requires-Dist: faiss-cpu>=1.7
|
|
9
|
+
Requires-Dist: numpy>=1.23
|
|
10
|
+
Requires-Dist: pandas>=1.5
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
|
|
13
|
+
Requires-Dist: pytest>=7; extra == 'dev'
|
|
14
|
+
Provides-Extra: sentence-transformers
|
|
15
|
+
Requires-Dist: sentence-transformers>=2.2; extra == 'sentence-transformers'
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# llm-join
|
|
19
|
+
|
|
20
|
+
**The pandas join that understands what your data means.**
|
|
21
|
+
|
|
22
|
+
`pd.merge` joins on exact values. `llm-join` joins on *meaning* — using embeddings to find candidates and an LLM you already have to decide if they match.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Table of Contents
|
|
27
|
+
|
|
28
|
+
- [The Problem](#the-problem)
|
|
29
|
+
- [Install](#install)
|
|
30
|
+
- [Quick Start](#quick-start)
|
|
31
|
+
- [Why llm-join](#why-llm-join)
|
|
32
|
+
- [Real-World Use Cases](#real-world-use-cases)
|
|
33
|
+
- [How It Works](#how-it-works)
|
|
34
|
+
- [Cost & Scale](#cost--scale)
|
|
35
|
+
- [The problem with naive LLM joins](#the-problem-with-naive-llm-joins)
|
|
36
|
+
- [Stage 1: Embeddings narrow the search](#stage-1-embeddings-narrow-the-search-cheap)
|
|
37
|
+
- [Stage 2: LLM scores only the hard cases](#stage-2-llm-scores-only-the-hard-cases-accurate)
|
|
38
|
+
- [Real cost example](#real-cost-example)
|
|
39
|
+
- [Further cost controls](#further-cost-controls)
|
|
40
|
+
- [Usage](#usage)
|
|
41
|
+
- [Basic join](#basic-join)
|
|
42
|
+
- [With domain context](#with-domain-context)
|
|
43
|
+
- [See why matches were made](#see-why-matches-were-made)
|
|
44
|
+
- [Control cost](#control-cost)
|
|
45
|
+
- [Left join (audit unmatched rows)](#left-join-audit-unmatched-rows)
|
|
46
|
+
- [Multi-column join key](#multi-column-join-key)
|
|
47
|
+
- [Chaining multiple joins](#chaining-multiple-joins)
|
|
48
|
+
- [Works with Any LLM](#works-with-any-llm)
|
|
49
|
+
- [Works with Any Embedding Function](#works-with-any-embedding-function)
|
|
50
|
+
- [vs. Alternatives](#vs-alternatives)
|
|
51
|
+
- [Parameters](#parameters)
|
|
52
|
+
- [License](#license)
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from llm_join import fuzzy_join
|
|
56
|
+
|
|
57
|
+
result = fuzzy_join(df1, df2, left_on="vendor", right_on="supplier_name", llm=my_llm, embed_fn=my_embed, context="company names — match legal entity variants and abbreviations")
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## The Problem
|
|
63
|
+
|
|
64
|
+
You have two DataFrames. Same data, different text:
|
|
65
|
+
|
|
66
|
+
| Your system | Their system |
|
|
67
|
+
|-------------|--------------|
|
|
68
|
+
| `Goldman Sachs & Co.` | `The Goldman Sachs Group Inc` |
|
|
69
|
+
| `Sony WH-1000XM5` | `SONY-WH1000XM5-BLK` |
|
|
70
|
+
| `MSFT Q4 license renewal` | `Microsoft Enterprise Agreement Q4-2024` |
|
|
71
|
+
| `Python programming` | `Python (language)` |
|
|
72
|
+
|
|
73
|
+
`pd.merge` returns nothing. Fuzzy string matching gets the wrong answer. You end up writing custom logic — or doing it by hand.
|
|
74
|
+
|
|
75
|
+
**llm-join solves this in one line.**
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Install
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
# From GitHub (not yet on PyPI)
|
|
83
|
+
pip install git+https://github.com/adityabalki/llm-join.git
|
|
84
|
+
|
|
85
|
+
# Or clone and install locally
|
|
86
|
+
git clone https://github.com/adityabalki/llm-join.git
|
|
87
|
+
cd llm-join
|
|
88
|
+
pip install -e .
|
|
89
|
+
|
|
90
|
+
# Or copy the wheel to air-gapped machines
|
|
91
|
+
pip install llm_join-0.2.0-py3-none-any.whl
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Quick Start
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
import pandas as pd
|
|
100
|
+
import openai
|
|
101
|
+
from llm_join import fuzzy_join
|
|
102
|
+
|
|
103
|
+
# Your data
|
|
104
|
+
df1 = pd.DataFrame({
|
|
105
|
+
"vendor": ["Goldman Sachs & Co.", "Amazon Web Services", "Microsoft Corp"],
|
|
106
|
+
"spend": [1_200_000, 890_000, 340_000]
|
|
107
|
+
})
|
|
108
|
+
|
|
109
|
+
df2 = pd.DataFrame({
|
|
110
|
+
"supplier_name": ["The Goldman Sachs Group Inc", "Amazon.com Inc.", "Microsoft Corporation"],
|
|
111
|
+
"category": ["Finance", "Cloud", "Software"]
|
|
112
|
+
})
|
|
113
|
+
|
|
114
|
+
# Wire up any LLM you already use
|
|
115
|
+
client = openai.OpenAI()
|
|
116
|
+
def llm(prompt):
|
|
117
|
+
return client.chat.completions.create(
|
|
118
|
+
model="gpt-4o-mini",
|
|
119
|
+
messages=[{"role": "user", "content": prompt}]
|
|
120
|
+
).choices[0].message.content
|
|
121
|
+
|
|
122
|
+
# Wire up your embedding function
|
|
123
|
+
import numpy as np
|
|
124
|
+
def my_embed(texts):
|
|
125
|
+
response = client.embeddings.create(model="text-embedding-3-small", input=texts)
|
|
126
|
+
return np.array([d.embedding for d in response.data], dtype="float32")
|
|
127
|
+
|
|
128
|
+
# Join (inner by default — only rows that matched)
|
|
129
|
+
result = fuzzy_join(
|
|
130
|
+
df1, df2,
|
|
131
|
+
left_on="vendor",
|
|
132
|
+
right_on="supplier_name",
|
|
133
|
+
llm=llm,
|
|
134
|
+
embed_fn=my_embed,
|
|
135
|
+
context="company names — match legal entity variants and abbreviations",
|
|
136
|
+
how="inner", # "inner" | "left" | "right" | "outer"
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
print(result)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
| vendor | spend | supplier_name | category |
|
|
143
|
+
|---|---:|---|---|
|
|
144
|
+
| Goldman Sachs & Co. | 1,200,000 | The Goldman Sachs Group Inc | Finance |
|
|
145
|
+
| Amazon Web Services | 890,000 | Amazon.com Inc. | Cloud |
|
|
146
|
+
| Microsoft Corp | 340,000 | Microsoft Corporation | Software |
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Why llm-join
|
|
151
|
+
|
|
152
|
+
### vs. `pd.merge`
|
|
153
|
+
Exact string match only. Fails on any variation in naming.
|
|
154
|
+
|
|
155
|
+
### vs. fuzzy string matching (`fuzzywuzzy`, `rapidfuzz`)
|
|
156
|
+
Character similarity, not semantic meaning. `"iPhone 14 Pro"` vs `"iPhone 14 Pro Max"` scores high — but they are different products. `"CABLE-USBC-200CM-BLK"` vs `"USB-C charging cable 2m black"` scores near zero — even though they are the same item.
|
|
157
|
+
|
|
158
|
+
### vs. embedding similarity alone
|
|
159
|
+
Fast and cheap, but no reasoning. Can't explain *why* two values match or catch false positives confidently.
|
|
160
|
+
|
|
161
|
+
### llm-join
|
|
162
|
+
Embeddings narrow down candidates (fast, cheap). LLM makes the final call with context (accurate). You get the best of both.
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## Real-World Use Cases
|
|
167
|
+
|
|
168
|
+
| Domain | Left table | Right table | Problem |
|
|
169
|
+
|--------|-----------|-------------|---------|
|
|
170
|
+
| **Supply chain** | Buyer catalog SKU | Supplier SKU | Match products across 50+ vendor catalogs |
|
|
171
|
+
| **Finance** | Expense report payee | GL account / vendor master | Reconcile transactions automatically |
|
|
172
|
+
| **Legal / M&A** | Contract party name | Corporate registry | Identify true legal entity |
|
|
173
|
+
| **Compliance** | Customer name | OFAC sanctions list | Sanctions screening at scale |
|
|
174
|
+
| **Retail / e-commerce** | Marketplace product listing | Master product catalog | Deduplicate listings across 50+ sellers |
|
|
175
|
+
| **Logistics** | Shipment description | Harmonized tariff code | Auto-classify goods at customs |
|
|
176
|
+
| **E-commerce** | Marketplace listing | Master product catalog | Deduplicate across platforms |
|
|
177
|
+
| **Research** | Author name | Citation database | Disambiguate authors |
|
|
178
|
+
| **Government** | Vendor name | Tax registry | Consolidate procurement spend |
|
|
179
|
+
| **Real estate** | Raw address input | Property records DB | Standardize and match addresses |
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## How It Works
|
|
184
|
+
|
|
185
|
+
**Example:** A retailer's internal purchase orders use plain English product names. Their supplier sends a catalog with SKU codes. `pd.merge` matches nothing. llm-join bridges the gap.
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
orders_df = pd.DataFrame({"product_name": [
|
|
189
|
+
"USB-C charging cable 2m black",
|
|
190
|
+
"ergonomic mesh office chair",
|
|
191
|
+
"27-inch 4K monitor",
|
|
192
|
+
], "qty": [500, 30, 12]})
|
|
193
|
+
|
|
194
|
+
catalog_df = pd.DataFrame({"sku": [
|
|
195
|
+
"CABLE-USBC-200CM-BLK",
|
|
196
|
+
"CABLE-USBA-200CM-BLK",
|
|
197
|
+
"CHAIR-MESH-ERG-ADJUSTABLE",
|
|
198
|
+
"CHAIR-TASK-FIXED-BLK",
|
|
199
|
+
"MON-27-4K-IPS-HDMI2",
|
|
200
|
+
], "unit_price": [8.99, 6.49, 349.00, 189.00, 429.00]})
|
|
201
|
+
|
|
202
|
+
result = fuzzy_join(
|
|
203
|
+
orders_df, catalog_df,
|
|
204
|
+
left_on="product_name", right_on="sku",
|
|
205
|
+
llm=my_llm, embed_fn=my_embed,
|
|
206
|
+
context="procurement — match buyer product descriptions to supplier SKU codes",
|
|
207
|
+
top_k=3, threshold=0.7,
|
|
208
|
+
)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
### Step 1 — Embed both columns
|
|
212
|
+
|
|
213
|
+
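Every value is converted to a vector by your `embed_fn`. No LLM call — one embedding pass per column (local models run in milliseconds; hosted embedding APIs add one batched request).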
|
|
214
|
+
|
|
215
|
+
| Value | Meaning captured in vector |
|
|
216
|
+
|---|---|
|
|
217
|
+
| `"USB-C charging cable 2m black"` | cable / USB-C / length / color |
|
|
218
|
+
| `"ergonomic mesh office chair"` | seating / ergonomic / mesh |
|
|
219
|
+
| `"27-inch 4K monitor"` | display / size / resolution |
|
|
220
|
+
| `"CABLE-USBC-200CM-BLK"` | cable / USB-C / 200cm / black |
|
|
221
|
+
| `"CHAIR-MESH-ERG-ADJUSTABLE"` | seating / mesh / ergonomic |
|
|
222
|
+
| `"MON-27-4K-IPS-HDMI2"` | display / 27in / 4K |
|
|
223
|
+
|
|
224
|
+
### Step 2 — FAISS retrieves top-K candidates (no LLM)
|
|
225
|
+
|
|
226
|
+
For each left row, faiss finds the `top_k` closest right vectors by cosine similarity. Everything else is eliminated — no LLM call needed.
|
|
227
|
+
|
|
228
|
+
**Query: `"USB-C charging cable 2m black"` → top_k=3**
|
|
229
|
+
|
|
230
|
+
| Rank | Candidate | Embed Score | Reaches LLM? |
|
|
231
|
+
|---:|---|---:|---|
|
|
232
|
+
| 0 | `CABLE-USBC-200CM-BLK` | 0.89 | ✓ yes |
|
|
233
|
+
| 1 | `CABLE-USBA-200CM-BLK` | 0.71 | ✓ yes |
|
|
234
|
+
| 2 | `CHAIR-TASK-FIXED-BLK` | 0.34 | ✓ yes (shares "BLK") |
|
|
235
|
+
| — | `CHAIR-MESH-ERG-ADJUSTABLE` | 0.11 | ✗ eliminated |
|
|
236
|
+
| — | `MON-27-4K-IPS-HDMI2` | 0.08 | ✗ eliminated |
|
|
237
|
+
|
|
238
|
+
**Query: `"ergonomic mesh office chair"` → top_k=3**
|
|
239
|
+
|
|
240
|
+
| Rank | Candidate | Embed Score | Reaches LLM? |
|
|
241
|
+
|---:|---|---:|---|
|
|
242
|
+
| 0 | `CHAIR-MESH-ERG-ADJUSTABLE` | 0.91 | ✓ yes |
|
|
243
|
+
| 1 | `CHAIR-TASK-FIXED-BLK` | 0.74 | ✓ yes |
|
|
244
|
+
| 2 | `CABLE-USBC-200CM-BLK` | 0.19 | ✓ yes |
|
|
245
|
+
| — | `MON-27-4K-IPS-HDMI2` | 0.06 | ✗ eliminated |
|
|
246
|
+
| — | `CABLE-USBA-200CM-BLK` | 0.14 | ✗ eliminated |
|
|
247
|
+
|
|
248
|
+
**Query: `"27-inch 4K monitor"` → top_k=3**
|
|
249
|
+
|
|
250
|
+
| Rank | Candidate | Embed Score | Reaches LLM? |
|
|
251
|
+
|---:|---|---:|---|
|
|
252
|
+
| 0 | `MON-27-4K-IPS-HDMI2` | 0.94 | ✓ yes |
|
|
253
|
+
| 1 | `CABLE-USBC-200CM-BLK` | 0.22 | ✓ yes |
|
|
254
|
+
| 2 | `CHAIR-TASK-FIXED-BLK` | 0.17 | ✓ yes |
|
|
255
|
+
| — | `CHAIR-MESH-ERG-ADJUSTABLE` | 0.09 | ✗ eliminated |
|
|
256
|
+
| — | `CABLE-USBA-200CM-BLK` | 0.12 | ✗ eliminated |
|
|
257
|
+
|
|
258
|
+
**Result: 3 LLM calls instead of 3 × 5 = 15 pair-by-pair calls.**
|
|
259
|
+
|
|
260
|
+
### Step 3 — One LLM call per left row scores all candidates
|
|
261
|
+
|
|
262
|
+
All top-K candidates go into a single prompt. LLM returns a JSON array — one API call, all candidates scored.
|
|
263
|
+
|
|
264
|
+
**Prompt sent for `"USB-C charging cable 2m black"`:**
|
|
265
|
+
```
|
|
266
|
+
Context: procurement — match buyer product descriptions to supplier SKU codes
|
|
267
|
+
|
|
268
|
+
LEFT: "USB-C charging cable 2m black"
|
|
269
|
+
|
|
270
|
+
Score each candidate (0.0–1.0):
|
|
271
|
+
0. CABLE-USBC-200CM-BLK
|
|
272
|
+
1. CABLE-USBA-200CM-BLK
|
|
273
|
+
2. CHAIR-TASK-FIXED-BLK
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
**LLM response:**
|
|
277
|
+
```json
|
|
278
|
+
[
|
|
279
|
+
{"index": 0, "score": 0.97, "reasoning": "USBC = USB-C, 200CM = 2m, BLK = black. Exact match on all three specs."},
|
|
280
|
+
{"index": 1, "score": 0.38, "reasoning": "Correct length and color but USBA is USB-A, not USB-C. Wrong connector type."},
|
|
281
|
+
{"index": 2, "score": 0.04, "reasoning": "This is a chair SKU. No relation to a cable."}
|
|
282
|
+
]
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
**Apply threshold=0.7:**
|
|
286
|
+
|
|
287
|
+
| Candidate | LLM Score | Decision |
|
|
288
|
+
|---|---:|---|
|
|
289
|
+
| `CABLE-USBC-200CM-BLK` | 0.97 | ✓ **best match — joined** |
|
|
290
|
+
| `CABLE-USBA-200CM-BLK` | 0.38 | ✗ below threshold |
|
|
291
|
+
| `CHAIR-TASK-FIXED-BLK` | 0.04 | ✗ below threshold |
|
|
292
|
+
|
|
293
|
+
### Step 4 — Merge matched rows
|
|
294
|
+
|
|
295
|
+
```python
|
|
296
|
+
print(result)
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
| product_name | qty | sku | unit_price |
|
|
300
|
+
|---|---:|---|---:|
|
|
301
|
+
| USB-C charging cable 2m black | 500 | CABLE-USBC-200CM-BLK | 8.99 |
|
|
302
|
+
| ergonomic mesh office chair | 30 | CHAIR-MESH-ERG-ADJUSTABLE | 349.00 |
|
|
303
|
+
| 27-inch 4K monitor | 12 | MON-27-4K-IPS-HDMI2 | 429.00 |
|
|
304
|
+
|
|
305
|
+
The LLM correctly rejected `CABLE-USBA-200CM-BLK` (wrong connector) even though embedding scored it 0.71 — this is the case where LLM reasoning earns its cost.
|
|
306
|
+
|
|
307
|
+
---
|
|
308
|
+
|
|
309
|
+
## Cost & Scale
|
|
310
|
+
|
|
311
|
+
### The problem with naive LLM joins
|
|
312
|
+
|
|
313
|
+
If you sent every possible pair to the LLM:
|
|
314
|
+
|
|
315
|
+
| Left rows | Right rows | Pairs to score | Cost (gpt-4o-mini ~$0.30/1M tokens) |
|
|
316
|
+
|-----------|------------|----------------|--------------------------------------|
|
|
317
|
+
| 1,000 | 10,000 | 10,000,000 | ~$150 |
|
|
318
|
+
| 10,000 | 100,000 | 1,000,000,000 | ~$15,000 |
|
|
319
|
+
| 100,000 | 1,000,000 | 100,000,000,000 | impossible |
|
|
320
|
+
|
|
321
|
+
**llm-join solves this with a two-stage pipeline.**
|
|
322
|
+
|
|
323
|
+
### Stage 1: Embeddings narrow the search (cheap)
|
|
324
|
+
|
|
325
|
+
Convert every value to a vector with your `embed_fn`, then use faiss to find the top-K most similar candidates per row. The retrieval step is pure math — no LLM calls — and runs in milliseconds.
|
|
326
|
+
|
|
327
|
+
| | |
|
|
328
|
+
|---|---|
|
|
329
|
+
| Input pairs | 10,000 × 100,000 = **1,000,000,000** |
|
|
330
|
+
| After embed + faiss (`top_k=5`) | **50,000** candidate pairs |
|
|
331
|
+
| Eliminated for free | **99.995%** of all pairs |
|
|
332
|
+
|
|
333
|
+
### Stage 2: LLM scores only the hard cases (accurate)
|
|
334
|
+
|
|
335
|
+
Your LLM sees a small batch of plausible candidates per row — not the full cross product. It scores each candidate; the highest score above `threshold` wins. One best match per left row.
|
|
336
|
+
|
|
337
|
+
Query: `"USB-C charging cable 2m black"`
|
|
338
|
+
|
|
339
|
+
| Rank | Candidate | LLM Score | Decision |
|
|
340
|
+
|---:|---|---:|---|
|
|
341
|
+
| 0 | `CABLE-USBC-200CM-BLK` | 0.97 | ✓ **best match — joined** |
|
|
342
|
+
| 1 | `CABLE-USBC-100CM-BLK` | 0.71 | scored, not selected (wrong length) |
|
|
343
|
+
| 2 | `CABLE-USBA-200CM-BLK` | 0.38 | scored, not selected (wrong connector) |
|
|
344
|
+
| 3 | `CHAIR-TASK-FIXED-BLK` | 0.04 | ✗ below threshold |
|
|
345
|
+
| 4 | `MON-27-4K-IPS-HDMI2` | 0.01 | ✗ below threshold |
|
|
346
|
+
|
|
347
|
+
### Real cost example
|
|
348
|
+
|
|
349
|
+
| Setup | Candidate pairs scored by LLM | Estimated cost |
|
|
350
|
+
|-------|-----------|----------------|
|
|
351
|
+
| 10k × 100k, top_k=5 | 50,000 | ~$0.75 |
|
|
352
|
+
| 10k × 100k, top_k=3 | 30,000 | ~$0.45 |
|
|
353
|
+
| 10k × 100k, embed_threshold=0.95 | ~5,000 (obvious matches skip LLM) | ~$0.08 |
|
|
354
|
+
|
|
355
|
+
### Further cost controls
|
|
356
|
+
|
|
357
|
+
```python
|
|
358
|
+
result = fuzzy_join(
|
|
359
|
+
df1, df2,
|
|
360
|
+
left_on="vendor", right_on="supplier",
|
|
361
|
+
llm=my_llm,
|
|
362
|
+
embed_fn=my_embed, context="company names — match vendor names to supplier records",
|
|
363
|
+
top_k=3, # fewer candidates = fewer LLM tokens per row
|
|
364
|
+
embed_threshold=0.95, # skip LLM entirely if embedding match score > 0.95
|
|
365
|
+
max_llm_calls=1000, # hard cap — warns and returns partial result if hit
|
|
366
|
+
)
|
|
367
|
+
```
|
|
368
|
+
|
|
369
|
+
| Parameter | Effect |
|
|
370
|
+
|-----------|--------|
|
|
371
|
+
| `top_k=3` (default 5) | 40% fewer LLM tokens |
|
|
372
|
+
| `embed_threshold=0.95` | Skip LLM for obvious matches — typically saves 30–60% |
|
|
373
|
+
| `max_llm_calls=N` | Budget guard — never exceeds N LLM calls |
|
|
374
|
+
|
|
375
|
+
---
|
|
376
|
+
|
|
377
|
+
## Usage
|
|
378
|
+
|
|
379
|
+
### Basic join
|
|
380
|
+
|
|
381
|
+
```python
|
|
382
|
+
result = fuzzy_join(
|
|
383
|
+
orders_df, catalog_df,
|
|
384
|
+
left_on="product_name",
|
|
385
|
+
right_on="sku",
|
|
386
|
+
llm=my_llm,
|
|
387
|
+
embed_fn=my_embed,
|
|
388
|
+
context="procurement — match buyer product descriptions to supplier SKU codes",
|
|
389
|
+
)
|
|
390
|
+
```
|
|
391
|
+
|
|
392
|
+
### With domain context
|
|
393
|
+
|
|
394
|
+
```python
|
|
395
|
+
result = fuzzy_join(
|
|
396
|
+
orders_df, catalog_df,
|
|
397
|
+
left_on="product_name",
|
|
398
|
+
right_on="sku",
|
|
399
|
+
llm=my_llm,
|
|
400
|
+
embed_fn=my_embed,
|
|
401
|
+
context="procurement — match buyer product descriptions to supplier SKU codes",
|
|
402
|
+
column_context={
|
|
403
|
+
"product_name": "plain English product description written by a buyer",
|
|
404
|
+
"sku": "supplier stock-keeping unit code, typically uppercase with hyphens",
|
|
405
|
+
},
|
|
406
|
+
)
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
### See why matches were made
|
|
410
|
+
|
|
411
|
+
```python
|
|
412
|
+
result = fuzzy_join(
|
|
413
|
+
df1, df2,
|
|
414
|
+
left_on="vendor",
|
|
415
|
+
right_on="supplier_name",
|
|
416
|
+
llm=my_llm,
|
|
417
|
+
embed_fn=my_embed, context="company names — match legal entity variants and abbreviations",
|
|
418
|
+
return_reasoning=True,
|
|
419
|
+
)
|
|
420
|
+
|
|
421
|
+
print(result[["vendor", "supplier_name", "_llm_score", "_llm_reasoning", "_embed_rank", "_match_method"]])
|
|
422
|
+
```
|
|
423
|
+
|
|
424
|
+
| vendor | supplier_name | _llm_score | _llm_reasoning | _embed_rank | _match_method |
|
|
425
|
+
|---|---|---:|---|---:|---|
|
|
426
|
+
| Goldman Sachs & Co. | The Goldman Sachs Group Inc | 0.97 | same firm, legal name variant | 0 | llm |
|
|
427
|
+
|
|
428
|
+
### Control cost
|
|
429
|
+
|
|
430
|
+
```python
|
|
431
|
+
result = fuzzy_join(
|
|
432
|
+
df1, df2,
|
|
433
|
+
left_on="vendor",
|
|
434
|
+
right_on="supplier_name",
|
|
435
|
+
llm=my_llm,
|
|
436
|
+
embed_fn=my_embed, context="company names — match legal entity variants and abbreviations",
|
|
437
|
+
embed_threshold=0.95, # skip LLM if embedding match is obvious
|
|
438
|
+
max_llm_calls=500, # hard cap — warns and returns partial result if hit
|
|
439
|
+
top_k=3, # fewer candidates = fewer LLM tokens
|
|
440
|
+
)
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
### Left join (audit unmatched rows)
|
|
444
|
+
|
|
445
|
+
`how="left"` keeps all left rows — unmatched ones get NaN right columns. Useful to see what the LLM failed to match.
|
|
446
|
+
|
|
447
|
+
```python
|
|
448
|
+
result = fuzzy_join(df1, df2, left_on="a", right_on="b", llm=my_llm, embed_fn=my_embed, context="describe what columns a and b contain and what counts as a match", how="left")
|
|
449
|
+
|
|
450
|
+
# Rows with no match
|
|
451
|
+
unmatched = result[result["b"].isna()]
|
|
452
|
+
```
|
|
453
|
+
|
|
454
|
+
> **Note:** `how="outer"` is useful for reconciliation — unmatched left rows are left values with no match above threshold; unmatched right rows are right values that were never selected as a best match for any left row. `cross` join is not supported (it would be the naive O(n×m) approach that llm-join is designed to avoid).
|
|
455
|
+
|
|
456
|
+
### Multi-column join key
|
|
457
|
+
|
|
458
|
+
```python
|
|
459
|
+
# orders_df has separate "product_name" and "category" columns
|
|
460
|
+
# catalog_df has a single "sku_description" column like "CABLE / USB-C / 200CM / BLK"
|
|
461
|
+
|
|
462
|
+
result = fuzzy_join(
|
|
463
|
+
orders_df, catalog_df,
|
|
464
|
+
left_on=["product_name", "category"], # concatenated: "USB-C cable 2m · electronics"
|
|
465
|
+
right_on="sku_description",
|
|
466
|
+
llm=my_llm,
|
|
467
|
+
embed_fn=my_embed,
|
|
468
|
+
context="procurement — match buyer product + category to supplier SKU description",
|
|
469
|
+
)
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
### Chaining multiple joins
|
|
473
|
+
|
|
474
|
+
Each `fuzzy_join` returns a regular DataFrame — pipe them like `pd.merge`.
|
|
475
|
+
|
|
476
|
+
```python
|
|
477
|
+
# df1: transactions (vendor + product columns)
|
|
478
|
+
# df2: vendor master
|
|
479
|
+
# df3: product catalog
|
|
480
|
+
|
|
481
|
+
# Step 1 — match vendors
|
|
482
|
+
step1 = fuzzy_join(
|
|
483
|
+
df1, df2,
|
|
484
|
+
left_on="vendor",
|
|
485
|
+
right_on="supplier_name",
|
|
486
|
+
llm=my_llm,
|
|
487
|
+
embed_fn=my_embed, context="company names — match transaction vendors to vendor master records",
|
|
488
|
+
how="left",
|
|
489
|
+
return_reasoning=True,
|
|
490
|
+
)
|
|
491
|
+
step1 = step1.rename(columns={
|
|
492
|
+
"_llm_score": "_vendor_score",
|
|
493
|
+
"_llm_reasoning": "_vendor_reasoning",
|
|
494
|
+
"_match_method": "_vendor_method",
|
|
495
|
+
})
|
|
496
|
+
|
|
497
|
+
# Step 2 — match products on result of step 1
|
|
498
|
+
result = fuzzy_join(
|
|
499
|
+
step1, df3,
|
|
500
|
+
left_on="product",
|
|
501
|
+
right_on="catalog_item",
|
|
502
|
+
llm=my_llm,
|
|
503
|
+
embed_fn=my_embed, context="products — match transaction product descriptions to catalog items",
|
|
504
|
+
how="left",
|
|
505
|
+
return_reasoning=True,
|
|
506
|
+
)
|
|
507
|
+
# result now has _vendor_score + _llm_score side by side; also rename "_embed_rank" in step 1 if you need to keep both embedding ranks
|
|
508
|
+
```
|
|
509
|
+
|
|
510
|
+
---
|
|
511
|
+
|
|
512
|
+
## Works with Any LLM
|
|
513
|
+
|
|
514
|
+
Pass any callable that takes a prompt string and returns a string.
|
|
515
|
+
|
|
516
|
+
```python
|
|
517
|
+
# OpenAI
|
|
518
|
+
import openai
|
|
519
|
+
client = openai.OpenAI()
|
|
520
|
+
llm = lambda p: client.chat.completions.create(
|
|
521
|
+
model="gpt-4o-mini",
|
|
522
|
+
messages=[{"role": "user", "content": p}]
|
|
523
|
+
).choices[0].message.content
|
|
524
|
+
|
|
525
|
+
# Anthropic
|
|
526
|
+
import anthropic
|
|
527
|
+
client = anthropic.Anthropic()
|
|
528
|
+
llm = lambda p: client.messages.create(
|
|
529
|
+
model="claude-opus-4-7", max_tokens=512,
|
|
530
|
+
messages=[{"role": "user", "content": p}]
|
|
531
|
+
).content[0].text
|
|
532
|
+
|
|
533
|
+
# Google Gemini
|
|
534
|
+
import google.generativeai as genai
|
|
535
|
+
model = genai.GenerativeModel("gemini-2.0-flash")
|
|
536
|
+
llm = lambda p: model.generate_content(p).text
|
|
537
|
+
|
|
538
|
+
# Ollama (local, free)
|
|
539
|
+
import ollama
|
|
540
|
+
llm = lambda p: ollama.chat(
|
|
541
|
+
model="llama3.2", messages=[{"role": "user", "content": p}]
|
|
542
|
+
)["message"]["content"]
|
|
543
|
+
|
|
544
|
+
# Any custom endpoint
|
|
545
|
+
import requests
|
|
546
|
+
llm = lambda p: requests.post(
|
|
547
|
+
"https://your-llm-api.com/chat",
|
|
548
|
+
json={"prompt": p}
|
|
549
|
+
).json()["response"]
|
|
550
|
+
```
|
|
551
|
+
|
|
552
|
+
## Works with Any Embedding Function
|
|
553
|
+
|
|
554
|
+
Pass any callable `(list[str]) -> np.ndarray` (shape `[n, dim]`, dtype `float32`).
|
|
555
|
+
|
|
556
|
+
```python
|
|
557
|
+
import numpy as np
|
|
558
|
+
|
|
559
|
+
# OpenAI embeddings
|
|
560
|
+
import openai
|
|
561
|
+
client = openai.OpenAI()
|
|
562
|
+
def my_embed(texts):
|
|
563
|
+
response = client.embeddings.create(model="text-embedding-3-small", input=texts)
|
|
564
|
+
return np.array([d.embedding for d in response.data], dtype="float32")
|
|
565
|
+
|
|
566
|
+
# Cohere
|
|
567
|
+
import cohere
|
|
568
|
+
co = cohere.Client("YOUR_KEY")
|
|
569
|
+
def my_embed(texts):
|
|
570
|
+
response = co.embed(texts=texts, model="embed-english-v3.0", input_type="search_document")
|
|
571
|
+
return np.array(response.embeddings, dtype="float32")
|
|
572
|
+
|
|
573
|
+
# sentence-transformers (local)
|
|
574
|
+
from sentence_transformers import SentenceTransformer
|
|
575
|
+
model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
576
|
+
def my_embed(texts):
|
|
577
|
+
return model.encode(texts, convert_to_numpy=True).astype("float32")
|
|
578
|
+
|
|
579
|
+
# Any custom endpoint
|
|
580
|
+
import requests
|
|
581
|
+
def my_embed(texts):
|
|
582
|
+
response = requests.post(
|
|
583
|
+
"https://your-embed-api.com/embed",
|
|
584
|
+
json={"texts": texts}
|
|
585
|
+
).json()
|
|
586
|
+
return np.array(response["embeddings"], dtype="float32")
|
|
587
|
+
```
|
|
588
|
+
|
|
589
|
+
---
|
|
590
|
+
|
|
591
|
+
## vs. Alternatives
|
|
592
|
+
|
|
593
|
+
| | llm-join | pd.merge | fuzzywuzzy | Jellyjoin | LinkTransformer |
|
|
594
|
+
|---|:---:|:---:|:---:|:---:|:---:|
|
|
595
|
+
| Semantic matching | ✓ | ✗ | ✗ | ✓ | ✓ |
|
|
596
|
+
| LLM makes final decision | ✓ | ✗ | ✗ | ✗ | optional |
|
|
597
|
+
| Reasoning per match | ✓ | ✗ | ✗ | ✗ | ✗ |
|
|
598
|
+
| Domain context injection | ✓ | ✗ | ✗ | ✗ | ✗ |
|
|
599
|
+
| Bring-your-own LLM callable | ✓ | n/a | n/a | ✗ | ✗ |
|
|
600
|
+
| Bring-your-own embed callable | ✓ | n/a | n/a | ✗ | ✗ |
|
|
601
|
+
| Enterprise-safe (no forced downloads) | ✓ | ✓ | ✓ | partial | ✗ |
|
|
602
|
+
| Hard cost cap (`max_llm_calls`) | ✓ | n/a | n/a | ✗ | ✗ |
|
|
603
|
+
| License | **MIT** | BSD | MIT | MIT | **GPL-3.0** |
|
|
604
|
+
| Install size | ~50 MB | ~20 MB | ~30 MB | ~60 MB | **~3 GB** |
|
|
605
|
+
|
|
606
|
+
---
|
|
607
|
+
|
|
608
|
+
## Parameters
|
|
609
|
+
|
|
610
|
+
| Parameter | Default | Description |
|
|
611
|
+
|-----------|---------|-------------|
|
|
612
|
+
| `left_on` | required | Column name(s) in df1 |
|
|
613
|
+
| `right_on` | required | Column name(s) in df2 |
|
|
614
|
+
| `llm` | required | Callable `(prompt: str) -> str` — your LLM function |
|
|
615
|
+
| `embed_fn` | required | Callable `(list[str]) -> np.ndarray` — your embedding function |
|
|
616
|
+
| `context` | required | Domain context injected into LLM prompt — describe what the columns represent and what kind of match to make |
|
|
617
|
+
| `column_context` | `{}` | Per-column context dict `{"col": "description"}` |
|
|
618
|
+
| `top_k` | `5` | Embedding candidates retrieved per row before LLM scoring |
|
|
619
|
+
| `batch_size` | `32` | Reserved for future LLM batching (passed through to config) |
|
|
620
|
+
| `threshold` | `0.7` | Minimum LLM score (0–1) to accept a match |
|
|
621
|
+
| `how` | `"inner"` | Join type: `inner` (matched pairs only) / `left` (all left rows, NaN where unmatched) / `right` (all right rows, NaN where no left row matched them) / `outer` (full picture — both unmatched left and unmatched right rows). |
|
|
622
|
+
| `embed_threshold` | `None` | Skip LLM when embedding score is decisive (saves cost) |
|
|
623
|
+
| `max_llm_calls` | `None` | Hard cap on LLM calls — returns partial result with warning if hit |
|
|
624
|
+
| `max_retries` | `3` | Retry failed LLM calls with exponential backoff (1s, 2s, 4s…). Set `0` to disable. |
|
|
625
|
+
| `return_reasoning` | `False` | Append `_llm_score`, `_llm_reasoning`, `_embed_rank`, `_match_method` columns. `_match_method` is `"llm"` or `"embed_threshold"` — tells you which rows skipped the LLM. |
|
|
626
|
+
|
|
627
|
+
---
|
|
628
|
+
|
|
629
|
+
## License
|
|
630
|
+
|
|
631
|
+
MIT
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
llm_join/__init__.py,sha256=6xpu-ZnwfWvtH-uOX3PY4jgAA1I5D8WSp8xL__kVyp4,85
|
|
2
|
+
llm_join/config.py,sha256=ne_2cY1HFGkUMTa_YSRqU2W41LytM9PQQWYR8c48j2Q,2130
|
|
3
|
+
llm_join/join.py,sha256=KTMzgIpdp9hypq7ahnv2XZBs0nzwkJ46vrAPHcRPgb8,4537
|
|
4
|
+
llm_join/merger.py,sha256=S92_aMtgxQohUzqoVeTaqQBuq3zJ29L4xbzClf-0kzg,2306
|
|
5
|
+
llm_join/prompts.py,sha256=soh7WKIKExgN8cSMFxkVKsOLGuYKGyTlXaLrVRF0KJQ,625
|
|
6
|
+
llm_join/retriever.py,sha256=mY-Rx1TWpHOqTA7dxgxIN_KzW-mhasTf9OXhx1x0Fu8,2012
|
|
7
|
+
llm_join/scorer.py,sha256=3zwMFfFigo6XoSDq0IgFrypRcsNd9OdGc2hW3iY-BhQ,5821
|
|
8
|
+
llm_join-0.2.0.dist-info/METADATA,sha256=lCji78ey-p8GhSvaWYHJLjwmV02Ms1C5f6kSOnDHNQg,21237
|
|
9
|
+
llm_join-0.2.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
10
|
+
llm_join-0.2.0.dist-info/RECORD,,
|