llm-join 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llm_join/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from llm_join.join import fuzzy_join
2
+
3
# Public API of the package.
__all__ = ["fuzzy_join"]
# Keep in sync with the distribution metadata (the wheel is published as 0.2.0).
__version__ = "0.2.0"
llm_join/config.py ADDED
@@ -0,0 +1,52 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Callable, Optional
3
+
4
+ import numpy as np
5
+
6
+
7
@dataclass
class ColumnConfig:
    """Validated settings for a single fuzzy join between two columns.

    Every constraint is checked in ``__post_init__``; the first violated one
    raises ``ValueError``.
    """

    left_col: str
    right_col: str
    embed_fn: Callable[[list[str]], np.ndarray]
    context: str = ""
    column_context: dict[str, str] = field(default_factory=dict)
    top_k: int = 5
    threshold: float = 0.7
    batch_size: int = 32
    embed_threshold: Optional[float] = None
    max_llm_calls: Optional[int] = None
    max_retries: int = 3

    @staticmethod
    def _outside_unit_interval(value) -> bool:
        """Return True unless ``0 < value <= 1``."""
        return not 0.0 < value <= 1.0

    def __post_init__(self):
        # Column names first, in declaration order.
        for attr in ("left_col", "right_col"):
            if not getattr(self, attr):
                raise ValueError(f"{attr} must not be empty")

        # A blank (or whitespace-only) context is rejected outright.
        if not (self.context and self.context.strip()):
            raise ValueError(
                "context must not be empty — describe what the columns represent and what kind of match to make. "
                "Example: \"pharmaceutical drug names — match generic INN names to US brand names\""
            )

        if self._outside_unit_interval(self.threshold):
            raise ValueError(f"threshold must be in (0, 1], got {self.threshold}")

        # Both counters share the same lower bound and message shape.
        for attr in ("top_k", "batch_size"):
            value = getattr(self, attr)
            if value < 1:
                raise ValueError(f"{attr} must be >= 1, got {value}")

        cutoff = self.embed_threshold
        if cutoff is not None and self._outside_unit_interval(cutoff):
            raise ValueError(f"embed_threshold must be in (0, 1], got {cutoff}")

        cap = self.max_llm_calls
        if cap is not None and cap < 1:
            raise ValueError(f"max_llm_calls must be >= 1, got {cap}")

        if self.max_retries < 0:
            raise ValueError(f"max_retries must be >= 0, got {self.max_retries}")

    @property
    def context_str(self) -> str:
        """Join the general context and any per-column notes with ``". "``."""
        lead = [self.context] if self.context else []
        column_notes = [
            f"{name}: {self.column_context[name]}"
            for name in (self.left_col, self.right_col)
            if name in self.column_context
        ]
        return ". ".join(lead + column_notes)
llm_join/join.py ADDED
@@ -0,0 +1,129 @@
1
+ import warnings
2
+ from typing import Callable, Optional, Union
3
+ import pandas as pd
4
+
5
+ from llm_join.config import ColumnConfig
6
+ from llm_join.retriever import EmbeddingRetriever
7
+ from llm_join.scorer import LLMScorer
8
+ from llm_join.merger import Merger, MatchResult
9
+
10
+
11
def fuzzy_join(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    *,
    left_on: Union[str, list[str]],
    right_on: Union[str, list[str]],
    llm: Callable,
    embed_fn: Callable,
    context: str,
    column_context: Optional[dict] = None,
    top_k: int = 5,
    threshold: float = 0.7,
    how: str = "inner",
    batch_size: int = 32,
    embed_threshold: Optional[float] = None,
    max_llm_calls: Optional[int] = None,
    max_retries: int = 3,
    return_reasoning: bool = False,
) -> pd.DataFrame:
    """Join ``df1`` and ``df2`` on approximately matching key values.

    For every distinct left key the ``top_k`` most similar right keys are
    retrieved by embedding similarity, then scored by ``llm`` in one call.
    Accepted pairs are handed to ``Merger.merge`` to build the result.

    Args:
        df1: Left frame.
        df2: Right frame.
        left_on: Left key column, or a list of columns whose stringified
            values are concatenated into a temporary key column.
        right_on: Right key column(s); same rules as ``left_on``.
        llm: Callable that receives a prompt string and returns the raw
            model response.
        embed_fn: Callable mapping a list of strings to an array of vectors.
        context: Required, non-empty description of what is being matched.
        column_context: Optional per-column descriptions keyed by column name.
        top_k: Number of candidates retrieved per left value.
        threshold: Minimum LLM score for a candidate to be accepted.
        how: One of ``"inner"``, ``"left"``, ``"right"``, ``"outer"``.
        batch_size: Validated by ``ColumnConfig``; not otherwise used here.
        embed_threshold: When set, a top embedding score at or above it is
            accepted without calling the LLM, and a top score below
            ``1 - embed_threshold`` skips the row with a warning.
        max_llm_calls: When set, stop scoring (with a warning) once this many
            rows have been sent to the LLM; the result is then partial.
        max_retries: Retries per row passed to ``LLMScorer``.
        return_reasoning: Keep the ``_llm_score`` / ``_llm_reasoning`` /
            ``_embed_rank`` / ``_match_method`` columns in the result.

    Returns:
        The merged frame produced by ``Merger.merge``.

    Raises:
        ValueError: If ``how`` or any ``ColumnConfig`` setting is invalid.
    """
    # Fail fast: Merger would reject a bad ``how`` anyway, but only after
    # every embedding and LLM call had already been paid for.
    if how not in ("inner", "left", "right", "outer"):
        raise ValueError(f"how must be inner/left/right/outer, got '{how}'")

    # Normalise column names to a single string (multi-col join concatenates values)
    left_col, right_col, df1, df2 = _normalise_cols(df1, df2, left_on, right_on)

    cfg = ColumnConfig(
        left_col=left_col,
        right_col=right_col,
        embed_fn=embed_fn,
        context=context,
        column_context=column_context or {},
        top_k=top_k,
        threshold=threshold,
        batch_size=batch_size,
        embed_threshold=embed_threshold,
        max_llm_calls=max_llm_calls,
        max_retries=max_retries,
    )

    retriever = EmbeddingRetriever(embed_fn=cfg.embed_fn)
    scorer = LLMScorer(llm, max_retries=cfg.max_retries)
    merger = Merger()

    # De-duplicate while keeping first-seen order. Repeated key values would
    # otherwise be embedded and scored once per occurrence, and the duplicated
    # match records would multiply rows in the final merge.
    left_vals = list(dict.fromkeys(df1[left_col].astype(str)))
    right_vals = list(dict.fromkeys(df2[right_col].astype(str)))

    # Scores are needed for the embed_threshold short-circuit and the fallback.
    candidates_per_row = retriever.retrieve_with_scores(left_vals, right_vals, top_k=cfg.top_k)

    matches: list[MatchResult] = []
    llm_call_count = 0

    for left_val, candidates_with_scores in zip(left_vals, candidates_per_row):
        if not candidates_with_scores:
            continue

        # embed_threshold short-circuit
        if cfg.embed_threshold is not None:
            best_candidate, best_score = candidates_with_scores[0]  # already sorted by score desc
            if best_score >= cfg.embed_threshold:
                matches.append(MatchResult(
                    left_val=left_val,
                    right_val=best_candidate,
                    score=best_score,
                    reasoning="skipped — embed score above threshold",
                    embed_rank=0,
                    match_method="embed_threshold",
                ))
                continue
            if best_score < (1.0 - cfg.embed_threshold):
                warnings.warn(
                    f"Row '{left_val}' skipped: top embed score {best_score:.3f} "
                    f"below non-match threshold {1.0 - cfg.embed_threshold:.3f}",
                    UserWarning,
                    stacklevel=2,
                )
                continue

        # max_llm_calls cap
        if cfg.max_llm_calls is not None and llm_call_count >= cfg.max_llm_calls:
            warnings.warn(
                f"max_llm_calls={cfg.max_llm_calls} reached. "
                "Remaining rows skipped. Result is partial.",
                UserWarning,
                stacklevel=2,
            )
            break

        candidates = [c for c, _ in candidates_with_scores]
        results = scorer.score(left_val, candidates, cfg.context_str, threshold=cfg.threshold)
        llm_call_count += 1
        if results is None:
            # LLM failed all retries — fall back to highest-scoring embed candidate
            best_candidate, best_embed_score = candidates_with_scores[0]
            matches.append(MatchResult(
                left_val=left_val,
                right_val=best_candidate,
                score=best_embed_score,
                reasoning="LLM failed — embed rank-0 fallback used",
                embed_rank=0,
                match_method="embed_fallback",
            ))
        elif results:
            matches.extend(results)

    return merger.merge(df1, df2, left_col, right_col, matches, how=how, return_reasoning=return_reasoning)
114
+
115
+
116
+ def _normalise_cols(df1, df2, left_on, right_on):
117
+ if isinstance(left_on, list):
118
+ col = "__left_key__"
119
+ df1 = df1.copy()
120
+ df1[col] = df1[left_on].astype(str).agg(" ".join, axis=1)
121
+ left_on = col
122
+ if isinstance(right_on, list):
123
+ col = "__right_key__"
124
+ df2 = df2.copy()
125
+ df2[col] = df2[right_on].astype(str).agg(" ".join, axis=1)
126
+ right_on = col
127
+ return left_on, right_on, df1, df2
128
+
129
+
llm_join/merger.py ADDED
@@ -0,0 +1,64 @@
1
+ from dataclasses import dataclass
2
+ import pandas as pd
3
+
4
+
5
@dataclass
class MatchResult:
    """One accepted pairing of a left key value with a right key value."""

    left_val: str
    right_val: str
    # LLM score, or the embedding similarity when the LLM was not used.
    score: float
    reasoning: str
    # Position of the right value in the embedding-ranked candidate list.
    embed_rank: int = 0
    # How the pair was accepted: "llm", "embed_threshold" or "embed_fallback".
    match_method: str = "llm"
13
+
14
+
15
+ class Merger:
16
+ def merge(
17
+ self,
18
+ df1: pd.DataFrame,
19
+ df2: pd.DataFrame,
20
+ left_col: str,
21
+ right_col: str,
22
+ matches: list[MatchResult],
23
+ how: str = "inner",
24
+ return_reasoning: bool = False,
25
+ ) -> pd.DataFrame:
26
+ if how not in ("inner", "left", "right", "outer"):
27
+ raise ValueError(f"how must be inner/left/right/outer, got '{how}'")
28
+ if right_col in df1.columns and right_col != left_col:
29
+ raise ValueError(
30
+ f"right_col '{right_col}' already exists in df1; rename before merging"
31
+ )
32
+
33
+ if not matches:
34
+ empty = pd.DataFrame(columns=list(df1.columns) + [
35
+ c for c in df2.columns if c != right_col or right_col == left_col
36
+ ])
37
+ if how == "inner":
38
+ return empty
39
+ if how == "left":
40
+ return pd.merge(df1, df2.iloc[:0], left_on=left_col, right_on=right_col, how="left")
41
+ if how == "right":
42
+ return pd.merge(df1.iloc[:0], df2, left_on=left_col, right_on=right_col, how="right")
43
+ # outer: return both frames with NaN fills
44
+ return pd.merge(df1, df2, left_on=left_col, right_on=right_col, how="outer")
45
+
46
+ match_df = pd.DataFrame({
47
+ left_col: [m.left_val for m in matches],
48
+ right_col: [m.right_val for m in matches],
49
+ "_llm_score": [m.score for m in matches],
50
+ "_llm_reasoning": [m.reasoning for m in matches],
51
+ "_embed_rank": [m.embed_rank for m in matches],
52
+ "_match_method": [m.match_method for m in matches],
53
+ })
54
+
55
+ df2_with_key = df2.merge(match_df, on=right_col, how="left")
56
+ result = df1.merge(df2_with_key, left_on=left_col, right_on=left_col, how=how)
57
+
58
+ if not return_reasoning:
59
+ result = result.drop(
60
+ columns=["_llm_score", "_llm_reasoning", "_embed_rank", "_match_method"],
61
+ errors="ignore",
62
+ )
63
+
64
+ return result.reset_index(drop=True)
llm_join/prompts.py ADDED
@@ -0,0 +1,17 @@
1
# Prompt skeleton: ``{context_line}`` is either empty or a full "Context: ...\n"
# line; ``{pairs}`` is the numbered LEFT/RIGHT list. Doubled braces are literal.
_TEMPLATE = "\n".join((
    "You are a data matching assistant.",
    "{context_line}For each pair below, score how likely LEFT matches RIGHT (0.0–1.0).",
    "Respond ONLY as a JSON array with no extra text:",
    '[{{"index": 0, "score": 0.95, "reasoning": "brief explanation"}}, ...]',
    "",
    "Pairs:",
    "{pairs}",
))


def build_prompt(left_val: str, candidates: list[str], context_str: str) -> str:
    """Render the scoring prompt for one left value and its candidates.

    Each candidate becomes a numbered ``LEFT | RIGHT`` line whose number is
    the index the model is asked to echo back. A context line is included
    only when ``context_str`` contains non-whitespace text.
    """
    if context_str.strip():
        context_line = f"Context: {context_str}\n"
    else:
        context_line = ""

    numbered = [
        f'{position}. LEFT: "{left_val}" | RIGHT: "{candidate}"'
        for position, candidate in enumerate(candidates)
    ]
    return _TEMPLATE.format(context_line=context_line, pairs="\n".join(numbered))
llm_join/retriever.py ADDED
@@ -0,0 +1,68 @@
1
+ from typing import Callable
2
+ import faiss
3
+ import numpy as np
4
+
5
+
6
class EmbeddingRetriever:
    """Finds the most similar corpus strings for each query string.

    Similarity is cosine similarity, computed as the inner product of
    L2-normalised vectors in an exact (flat) faiss index.
    """

    def __init__(self, embed_fn: Callable[[list[str]], np.ndarray]):
        self._embed_fn = embed_fn

    def _embed(self, texts: list[str]) -> np.ndarray:
        """Embed ``texts`` into a fresh C-contiguous float32 matrix.

        The result is always a copy, because it is normalised in place later
        and must not alias an array owned by ``embed_fn``. It is forced to C
        order, which faiss requires of its inputs.

        Raises:
            ValueError: If ``embed_fn`` does not return one vector per text.
        """
        vecs = np.array(self._embed_fn(texts), dtype="float32", order="C")
        if vecs.ndim != 2 or vecs.shape[0] != len(texts):
            raise ValueError(
                "embed_fn must return one vector per input text "
                f"(expected shape ({len(texts)}, dim), got {vecs.shape})"
            )
        return vecs

    def retrieve(
        self,
        query_vals: list[str],
        corpus_vals: list[str],
        top_k: int = 5,
    ) -> list[list[str]]:
        """Return, per query, up to ``top_k`` corpus values ordered by similarity."""
        return [
            [candidate for candidate, _ in row]
            for row in self.retrieve_with_scores(query_vals, corpus_vals, top_k=top_k)
        ]

    def retrieve_with_scores(
        self,
        query_vals: list[str],
        corpus_vals: list[str],
        top_k: int = 5,
    ) -> list[list[tuple[str, float]]]:
        """Returns list of (candidate, cosine_score) per query, sorted by score desc.

        An empty corpus yields an empty candidate list for every query; an
        empty query list yields ``[]`` without calling ``embed_fn``.
        """
        if not corpus_vals:
            return [[] for _ in query_vals]
        if not query_vals:
            return []

        top_k = min(top_k, len(corpus_vals))
        corpus_vecs = self._embed(corpus_vals)
        query_vecs = self._embed(query_vals)

        # L2-normalize for cosine similarity via inner product
        faiss.normalize_L2(corpus_vecs)
        faiss.normalize_L2(query_vecs)

        index = faiss.IndexFlatIP(corpus_vecs.shape[1])
        index.add(corpus_vecs)

        scores, indices = index.search(query_vecs, top_k)

        # faiss pads missing neighbours with index -1; skip those slots.
        return [
            [
                (corpus_vals[int(idx)], float(score))
                for idx, score in zip(row_indices, row_scores)
                if idx >= 0
            ]
            for row_indices, row_scores in zip(indices, scores)
        ]
llm_join/scorer.py ADDED
@@ -0,0 +1,159 @@
1
+ import asyncio
2
+ import json
3
+ import re
4
+ import time
5
+ import warnings
6
+ from typing import Callable, Optional
7
+
8
+ from llm_join.merger import MatchResult
9
+ from llm_join.prompts import build_prompt
10
+
11
+
12
class LLMScorer:
    """Scores candidate matches for one left value with a user-supplied LLM.

    The LLM is called with a prompt built by ``build_prompt`` and is expected
    to answer with a JSON array of ``{"index", "score", "reasoning"}`` objects.
    Failed calls are retried with exponential backoff (1s, 2s, 4s, ...).
    """

    def __init__(self, llm: Callable, max_retries: int = 3):
        # Local import: asyncio.iscoroutinefunction is deprecated in favour of
        # the inspect version.
        import inspect

        self._llm = llm
        self._is_async = inspect.iscoroutinefunction(llm)
        self._max_retries = max_retries

    def score(
        self,
        left_val: str,
        candidates: list[str],
        context_str: str,
        threshold: float = 0.7,
    ) -> "Optional[list[MatchResult]]":
        """Score ``candidates`` against ``left_val`` with a synchronous LLM.

        Returns:
            The best-scoring match(es) at or above ``threshold`` (all of them
            when several tie), ``[]`` when nothing qualifies, or ``None`` when
            every attempt raised, so the caller can apply its fallback.

        Raises:
            TypeError: If the LLM is a coroutine function.
        """
        if self._is_async:
            raise TypeError(
                "LLM is async; call score_async() instead, or pass a sync callable."
            )
        prompt = build_prompt(left_val, candidates, context_str)
        last_exc: Optional[Exception] = None
        for attempt in range(self._max_retries + 1):
            try:
                raw = self._llm(prompt)
                return self._parse(left_val, candidates, raw, threshold)
            except Exception as exc:
                last_exc = exc
                if attempt < self._max_retries:
                    wait = 2 ** attempt  # 1s, 2s, 4s, ...
                    warnings.warn(
                        f"LLM call failed for '{left_val}' (attempt {attempt + 1}/{self._max_retries + 1}): "
                        f"{exc!r}. Retrying in {wait}s.",
                        UserWarning,
                        stacklevel=2,
                    )
                    time.sleep(wait)
        warnings.warn(
            f"LLM call failed for '{left_val}' after {self._max_retries + 1} attempts: {last_exc!r}. "
            "Falling back to top embed candidate.",
            UserWarning,
            stacklevel=2,
        )
        return None  # signals LLM failure — caller applies embed fallback

    async def score_async(
        self,
        left_val: str,
        candidates: list[str],
        context_str: str,
        threshold: float = 0.7,
    ) -> "Optional[list[MatchResult]]":
        """Async counterpart of ``score``; also accepts a synchronous LLM."""
        prompt = build_prompt(left_val, candidates, context_str)
        last_exc: Optional[Exception] = None
        for attempt in range(self._max_retries + 1):
            try:
                if self._is_async:
                    raw = await self._llm(prompt)
                else:
                    raw = self._llm(prompt)
                return self._parse(left_val, candidates, raw, threshold)
            except Exception as exc:
                last_exc = exc
                if attempt < self._max_retries:
                    wait = 2 ** attempt
                    warnings.warn(
                        f"LLM call failed for '{left_val}' (attempt {attempt + 1}/{self._max_retries + 1}): "
                        f"{exc!r}. Retrying in {wait}s.",
                        UserWarning,
                        stacklevel=2,
                    )
                    await asyncio.sleep(wait)
        warnings.warn(
            f"LLM call failed for '{left_val}' after {self._max_retries + 1} attempts: {last_exc!r}. "
            "Falling back to top embed candidate.",
            UserWarning,
            stacklevel=2,
        )
        return None  # signals LLM failure — caller applies embed fallback

    @staticmethod
    def _find_json_array(raw: str) -> Optional[list]:
        """Return the first JSON array embedded anywhere in ``raw``, else None.

        Decoding starts at each ``[`` in turn, so brackets inside string
        values (e.g. a reasoning text containing ``]``) do not cut the array
        short the way a non-greedy regex would.
        """
        decoder = json.JSONDecoder()
        start = raw.find("[")
        while start != -1:
            try:
                value, _ = decoder.raw_decode(raw, start)
            except json.JSONDecodeError:
                pass
            else:
                if isinstance(value, list):
                    return value
            start = raw.find("[", start + 1)
        return None

    @staticmethod
    def _coerce_index(value) -> Optional[int]:
        """Interpret ``value`` as a candidate index, or return None."""
        if isinstance(value, bool):
            return None  # bool is an int subclass but never a valid index
        if isinstance(value, int):
            return value
        if isinstance(value, float) and value.is_integer():
            return int(value)
        if isinstance(value, str) and value.strip().isdigit():
            return int(value.strip())
        return None

    def _parse(
        self,
        left_val: str,
        candidates: list[str],
        raw: str,
        threshold: float,
    ) -> "list[MatchResult]":
        """Turn the raw LLM response into the best match(es) for ``left_val``.

        Unusable items (not an object, bad or out-of-range index, non-numeric
        score) are skipped rather than raised, so one bad element does not
        make the caller retry the whole LLM call.
        """
        try:
            # strip markdown code fences if present
            cleaned = (
                raw.strip()
                .removeprefix("```json")
                .removeprefix("```")
                .removesuffix("```")
                .strip()
            )
            parsed = json.loads(cleaned)
        except (json.JSONDecodeError, ValueError):
            # fallback: find a JSON array anywhere in the response
            parsed = self._find_json_array(raw)
            if parsed is None:
                warnings.warn(f"LLM returned malformed JSON for '{left_val}': {raw!r}")
                return []

        if not isinstance(parsed, list):
            warnings.warn(f"LLM returned non-array JSON for '{left_val}': {raw!r}")
            return []

        # Collect all valid scored items above threshold, deduplicated by index
        seen_indices: set[int] = set()
        scored = []
        for item in parsed:
            if not isinstance(item, dict):
                continue
            idx = self._coerce_index(item.get("index"))
            if idx is None or not (0 <= idx < len(candidates)):
                continue
            if idx in seen_indices:
                continue  # LLM returned duplicate index — skip
            seen_indices.add(idx)
            try:
                score = float(item.get("score", 0.0))
            except (TypeError, ValueError):
                continue
            if score >= threshold:
                reasoning = item.get("reasoning", "")
                scored.append((score, idx, "" if reasoning is None else str(reasoning)))

        if not scored:
            return []

        # Find best score, return ALL candidates that tie at that score
        best_score = max(s for s, _, _ in scored)
        tied = [(idx, reasoning) for score, idx, reasoning in scored if score == best_score]

        # Annotate reasoning when multiple candidates tie — visible in return_reasoning output
        tie_note = (
            f" [tied: {len(tied)} candidates scored {best_score} — all joined]"
            if len(tied) > 1 else ""
        )

        return [
            MatchResult(
                left_val=left_val,
                right_val=candidates[idx],
                score=best_score,
                reasoning=reasoning + tie_note,
                embed_rank=idx,
            )
            for idx, reasoning in tied
        ]
@@ -0,0 +1,631 @@
1
+ Metadata-Version: 2.4
2
+ Name: llm-join
3
+ Version: 0.2.0
4
+ Summary: Fuzzy join DataFrames using LLM scoring and embedding retrieval
5
+ License: MIT
6
+ Keywords: entity-resolution,fuzzy-join,llm,nlp,pandas
7
+ Requires-Python: >=3.9
8
+ Requires-Dist: faiss-cpu>=1.7
9
+ Requires-Dist: numpy>=1.23
10
+ Requires-Dist: pandas>=1.5
11
+ Provides-Extra: dev
12
+ Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
13
+ Requires-Dist: pytest>=7; extra == 'dev'
14
+ Provides-Extra: sentence-transformers
15
+ Requires-Dist: sentence-transformers>=2.2; extra == 'sentence-transformers'
16
+ Description-Content-Type: text/markdown
17
+
18
+ # llm-join
19
+
20
+ **The pandas join that understands what your data means.**
21
+
22
+ `pd.merge` joins on exact values. `llm-join` joins on *meaning* — using embeddings to find candidates and an LLM you already have to decide if they match.
23
+
24
+ ---
25
+
26
+ ## Table of Contents
27
+
28
+ - [The Problem](#the-problem)
29
+ - [Install](#install)
30
+ - [Quick Start](#quick-start)
31
+ - [Why llm-join](#why-llm-join)
32
+ - [Real-World Use Cases](#real-world-use-cases)
33
+ - [How It Works](#how-it-works)
34
+ - [Cost & Scale](#cost--scale)
35
+ - [The problem with naive LLM joins](#the-problem-with-naive-llm-joins)
36
+ - [Stage 1: Embeddings narrow the search](#stage-1-embeddings-narrow-the-search-cheap)
37
+ - [Stage 2: LLM scores only the hard cases](#stage-2-llm-scores-only-the-hard-cases-accurate)
38
+ - [Real cost example](#real-cost-example)
39
+ - [Further cost controls](#further-cost-controls)
40
+ - [Usage](#usage)
41
+ - [Basic join](#basic-join)
42
+ - [With domain context](#with-domain-context)
43
+ - [See why matches were made](#see-why-matches-were-made)
44
+ - [Control cost](#control-cost)
45
+ - [Left join (audit unmatched rows)](#left-join-audit-unmatched-rows)
46
+ - [Multi-column join key](#multi-column-join-key)
47
+ - [Chaining multiple joins](#chaining-multiple-joins)
48
+ - [Works with Any LLM](#works-with-any-llm)
49
+ - [Works with Any Embedding Function](#works-with-any-embedding-function)
50
+ - [vs. Alternatives](#vs-alternatives)
51
+ - [Parameters](#parameters)
52
+ - [License](#license)
53
+
54
+ ```python
55
+ from llm_join import fuzzy_join
56
+
57
+ result = fuzzy_join(df1, df2, left_on="vendor", right_on="supplier_name", llm=my_llm, embed_fn=my_embed, context="company names — match legal entity variants and abbreviations")
58
+ ```
59
+
60
+ ---
61
+
62
+ ## The Problem
63
+
64
+ You have two DataFrames. Same data, different text:
65
+
66
+ | Your system | Their system |
67
+ |-------------|--------------|
68
+ | `Goldman Sachs & Co.` | `The Goldman Sachs Group Inc` |
69
+ | `Sony WH-1000XM5` | `SONY-WH1000XM5-BLK` |
70
+ | `MSFT Q4 license renewal` | `Microsoft Enterprise Agreement Q4-2024` |
71
+ | `Python programming` | `Python (language)` |
72
+
73
+ `pd.merge` returns nothing. Fuzzy string matching gets the wrong answer. You end up writing custom logic — or doing it by hand.
74
+
75
+ **llm-join solves this in one line.**
76
+
77
+ ---
78
+
79
+ ## Install
80
+
81
+ ```bash
82
+ # From GitHub (not yet on PyPI)
83
+ pip install git+https://github.com/adityabalki/llm-join.git
84
+
85
+ # Or clone and install locally
86
+ git clone https://github.com/adityabalki/llm-join.git
87
+ cd llm-join
88
+ pip install -e .
89
+
90
+ # Or copy the wheel to air-gapped machines
91
+ pip install llm_join-0.2.0-py3-none-any.whl
92
+ ```
93
+
94
+ ---
95
+
96
+ ## Quick Start
97
+
98
+ ```python
99
+ import pandas as pd
100
+ import openai
101
+ from llm_join import fuzzy_join
102
+
103
+ # Your data
104
+ df1 = pd.DataFrame({
105
+ "vendor": ["Goldman Sachs & Co.", "Amazon Web Services", "Microsoft Corp"],
106
+ "spend": [1_200_000, 890_000, 340_000]
107
+ })
108
+
109
+ df2 = pd.DataFrame({
110
+ "supplier_name": ["The Goldman Sachs Group Inc", "Amazon.com Inc.", "Microsoft Corporation"],
111
+ "category": ["Finance", "Cloud", "Software"]
112
+ })
113
+
114
+ # Wire up any LLM you already use
115
+ client = openai.OpenAI()
116
+ def llm(prompt):
117
+ return client.chat.completions.create(
118
+ model="gpt-4o-mini",
119
+ messages=[{"role": "user", "content": prompt}]
120
+ ).choices[0].message.content
121
+
122
+ # Wire up your embedding function
123
+ import numpy as np
124
+ def my_embed(texts):
125
+ response = client.embeddings.create(model="text-embedding-3-small", input=texts)
126
+ return np.array([d.embedding for d in response.data], dtype="float32")
127
+
128
+ # Join (inner by default — only rows that matched)
129
+ result = fuzzy_join(
130
+ df1, df2,
131
+ left_on="vendor",
132
+ right_on="supplier_name",
133
+ llm=llm,
134
+ embed_fn=my_embed,
135
+ context="company names — match legal entity variants and abbreviations",
136
+ how="inner", # "inner" | "left" | "right" | "outer"
137
+ )
138
+
139
+ print(result)
140
+ ```
141
+
142
+ | vendor | spend | supplier_name | category |
143
+ |---|---:|---|---|
144
+ | Goldman Sachs & Co. | 1,200,000 | The Goldman Sachs Group Inc | Finance |
145
+ | Amazon Web Services | 890,000 | Amazon.com Inc. | Cloud |
146
+ | Microsoft Corp | 340,000 | Microsoft Corporation | Software |
147
+
148
+ ---
149
+
150
+ ## Why llm-join
151
+
152
+ ### vs. `pd.merge`
153
+ Exact string match only. Fails on any variation in naming.
154
+
155
+ ### vs. fuzzy string matching (`fuzzywuzzy`, `rapidfuzz`)
156
+ Character similarity, not semantic meaning. `"iPhone 14 Pro"` vs `"iPhone 14 Pro Max"` scores high — but they are different products. `"CABLE-USBC-200CM-BLK"` vs `"USB-C charging cable 2m black"` scores near zero — even though they are the same item.
157
+
158
+ ### vs. embedding similarity alone
159
+ Fast and cheap, but no reasoning. Can't explain *why* two values match or catch false positives confidently.
160
+
161
+ ### llm-join
162
+ Embeddings narrow down candidates (fast, cheap). LLM makes the final call with context (accurate). You get the best of both.
163
+
164
+ ---
165
+
166
+ ## Real-World Use Cases
167
+
168
+ | Domain | Left table | Right table | Problem |
169
+ |--------|-----------|-------------|---------|
170
+ | **Supply chain** | Buyer catalog SKU | Supplier SKU | Match products across 50+ vendor catalogs |
171
+ | **Finance** | Expense report payee | GL account / vendor master | Reconcile transactions automatically |
172
+ | **Legal / M&A** | Contract party name | Corporate registry | Identify true legal entity |
173
+ | **Compliance** | Customer name | OFAC sanctions list | Sanctions screening at scale |
174
+ | **Retail / e-commerce** | Marketplace product listing | Master product catalog | Deduplicate listings across 50+ sellers |
175
+ | **Logistics** | Shipment description | Harmonized tariff code | Auto-classify goods at customs |
176
+ | **E-commerce** | Marketplace listing | Master product catalog | Deduplicate across platforms |
177
+ | **Research** | Author name | Citation database | Disambiguate authors |
178
+ | **Government** | Vendor name | Tax registry | Consolidate procurement spend |
179
+ | **Real estate** | Raw address input | Property records DB | Standardize and match addresses |
180
+
181
+ ---
182
+
183
+ ## How It Works
184
+
185
+ **Example:** A retailer's internal purchase orders use plain English product names. Their supplier sends a catalog with SKU codes. `pd.merge` matches nothing. llm-join bridges the gap.
186
+
187
+ ```python
188
+ orders_df = pd.DataFrame({"product_name": [
189
+ "USB-C charging cable 2m black",
190
+ "ergonomic mesh office chair",
191
+ "27-inch 4K monitor",
192
+ ], "qty": [500, 30, 12]})
193
+
194
+ catalog_df = pd.DataFrame({"sku": [
195
+ "CABLE-USBC-200CM-BLK",
196
+ "CABLE-USBA-200CM-BLK",
197
+ "CHAIR-MESH-ERG-ADJUSTABLE",
198
+ "CHAIR-TASK-FIXED-BLK",
199
+ "MON-27-4K-IPS-HDMI2",
200
+ ], "unit_price": [8.99, 6.49, 349.00, 189.00, 429.00]})
201
+
202
+ result = fuzzy_join(
203
+ orders_df, catalog_df,
204
+ left_on="product_name", right_on="sku",
205
+ llm=my_llm, embed_fn=my_embed,
206
+ context="procurement — match buyer product descriptions to supplier SKU codes",
207
+ top_k=3, threshold=0.7,
208
+ )
209
+ ```
210
+
211
+ ### Step 1 — Embed both columns
212
+
213
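+ Every value is converted to a vector by your `embed_fn`. No LLM call is made at this stage — the only cost is one embedding pass over each column (local model or embeddings API, depending on the `embed_fn` you supply).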
214
+
215
+ | Value | Meaning captured in vector |
216
+ |---|---|
217
+ | `"USB-C charging cable 2m black"` | cable / USB-C / length / color |
218
+ | `"ergonomic mesh office chair"` | seating / ergonomic / mesh |
219
+ | `"27-inch 4K monitor"` | display / size / resolution |
220
+ | `"CABLE-USBC-200CM-BLK"` | cable / USB-C / 200cm / black |
221
+ | `"CHAIR-MESH-ERG-ADJUSTABLE"` | seating / mesh / ergonomic |
222
+ | `"MON-27-4K-IPS-HDMI2"` | display / 27in / 4K |
223
+
224
+ ### Step 2 — FAISS retrieves top-K candidates (no LLM)
225
+
226
+ For each left row, faiss finds the `top_k` closest right vectors by cosine similarity. Everything else is eliminated — no LLM call needed.
227
+
228
+ **Query: `"USB-C charging cable 2m black"` → top_k=3**
229
+
230
+ | Rank | Candidate | Embed Score | Reaches LLM? |
231
+ |---:|---|---:|---|
232
+ | 0 | `CABLE-USBC-200CM-BLK` | 0.89 | ✓ yes |
233
+ | 1 | `CABLE-USBA-200CM-BLK` | 0.71 | ✓ yes |
234
+ | 2 | `CHAIR-TASK-FIXED-BLK` | 0.34 | ✓ yes (shares "BLK") |
235
+ | — | `CHAIR-MESH-ERG-ADJUSTABLE` | 0.11 | ✗ eliminated |
236
+ | — | `MON-27-4K-IPS-HDMI2` | 0.08 | ✗ eliminated |
237
+
238
+ **Query: `"ergonomic mesh office chair"` → top_k=3**
239
+
240
+ | Rank | Candidate | Embed Score | Reaches LLM? |
241
+ |---:|---|---:|---|
242
+ | 0 | `CHAIR-MESH-ERG-ADJUSTABLE` | 0.91 | ✓ yes |
243
+ | 1 | `CHAIR-TASK-FIXED-BLK` | 0.74 | ✓ yes |
244
+ | 2 | `CABLE-USBC-200CM-BLK` | 0.19 | ✓ yes |
245
+ | — | `MON-27-4K-IPS-HDMI2` | 0.06 | ✗ eliminated |
246
+ | — | `CABLE-USBA-200CM-BLK` | 0.14 | ✗ eliminated |
247
+
248
+ **Query: `"27-inch 4K monitor"` → top_k=3**
249
+
250
+ | Rank | Candidate | Embed Score | Reaches LLM? |
251
+ |---:|---|---:|---|
252
+ | 0 | `MON-27-4K-IPS-HDMI2` | 0.94 | ✓ yes |
253
+ | 1 | `CABLE-USBC-200CM-BLK` | 0.22 | ✓ yes |
254
+ | 2 | `CHAIR-TASK-FIXED-BLK` | 0.17 | ✓ yes |
255
+ | — | `CHAIR-MESH-ERG-ADJUSTABLE` | 0.09 | ✗ eliminated |
256
+ | — | `CABLE-USBA-200CM-BLK` | 0.12 | ✗ eliminated |
257
+
258
+ **Result: 3 LLM calls instead of 3 × 5 = 15 pair-by-pair calls.**
259
+
260
+ ### Step 3 — One LLM call per left row scores all candidates
261
+
262
+ All top-K candidates go into a single prompt. LLM returns a JSON array — one API call, all candidates scored.
263
+
264
+ **Prompt sent for `"USB-C charging cable 2m black"`:**
265
+ ```
266
+ Context: procurement — match buyer product descriptions to supplier SKU codes
267
+
268
+ LEFT: "USB-C charging cable 2m black"
269
+
270
+ Score each candidate (0.0–1.0):
271
+ 0. CABLE-USBC-200CM-BLK
272
+ 1. CABLE-USBA-200CM-BLK
273
+ 2. CHAIR-TASK-FIXED-BLK
274
+ ```
275
+
276
+ **LLM response:**
277
+ ```json
278
+ [
279
+ {"index": 0, "score": 0.97, "reasoning": "USBC = USB-C, 200CM = 2m, BLK = black. Exact match on all three specs."},
280
+ {"index": 1, "score": 0.38, "reasoning": "Correct length and color but USBA is USB-A, not USB-C. Wrong connector type."},
281
+ {"index": 2, "score": 0.04, "reasoning": "This is a chair SKU. No relation to a cable."}
282
+ ]
283
+ ```
284
+
285
+ **Apply threshold=0.7:**
286
+
287
+ | Candidate | LLM Score | Decision |
288
+ |---|---:|---|
289
+ | `CABLE-USBC-200CM-BLK` | 0.97 | ✓ **best match — joined** |
290
+ | `CABLE-USBA-200CM-BLK` | 0.38 | ✗ below threshold |
291
+ | `CHAIR-TASK-FIXED-BLK` | 0.04 | ✗ below threshold |
292
+
293
+ ### Step 4 — Merge matched rows
294
+
295
+ ```python
296
+ print(result)
297
+ ```
298
+
299
+ | product_name | qty | sku | unit_price |
300
+ |---|---:|---|---:|
301
+ | USB-C charging cable 2m black | 500 | CABLE-USBC-200CM-BLK | 8.99 |
302
+ | ergonomic mesh office chair | 30 | CHAIR-MESH-ERG-ADJUSTABLE | 349.00 |
303
+ | 27-inch 4K monitor | 12 | MON-27-4K-IPS-HDMI2 | 429.00 |
304
+
305
+ The LLM correctly rejected `CABLE-USBA-200CM-BLK` (wrong connector) even though embedding scored it 0.71 — this is the case where LLM reasoning earns its cost.
306
+
307
+ ---
308
+
309
+ ## Cost & Scale
310
+
311
+ ### The problem with naive LLM joins
312
+
313
+ If you sent every possible pair to the LLM:
314
+
315
+ | Left rows | Right rows | Pairs to score | Cost (gpt-4o-mini ~$0.30/1M tokens) |
316
+ |-----------|------------|----------------|--------------------------------------|
317
+ | 1,000 | 10,000 | 10,000,000 | ~$150 |
318
+ | 10,000 | 100,000 | 1,000,000,000 | ~$15,000 |
319
+ | 100,000 | 1,000,000 | 100,000,000,000 | impossible |
320
+
321
+ **llm-join solves this with a two-stage pipeline.**
322
+
323
+ ### Stage 1: Embeddings narrow the search (cheap)
324
+
325
+ Convert every value to a vector with your `embed_fn`. Use faiss to find the top-K most similar candidates per row. The similarity search itself is pure math — no LLM calls — and runs in milliseconds once the embeddings are computed.
326
+
327
+ | | |
328
+ |---|---|
329
+ | Input pairs | 10,000 × 100,000 = **1,000,000,000** |
330
+ | After embed + faiss (`top_k=5`) | **50,000** candidate pairs |
331
+ | Eliminated for free | **99.995%** of all pairs |
332
+
333
+ ### Stage 2: LLM scores only the hard cases (accurate)
334
+
335
+ Your LLM sees a small batch of plausible candidates per row — not the full cross product. It scores each candidate; the highest score at or above `threshold` wins. One best match per left row (if several candidates tie for the top score, all of them are joined).
336
+
337
+ Query: `"USB-C charging cable 2m black"`
338
+
339
+ | Rank | Candidate | LLM Score | Decision |
340
+ |---:|---|---:|---|
341
+ | 0 | `CABLE-USBC-200CM-BLK` | 0.97 | ✓ **best match — joined** |
342
+ | 1 | `CABLE-USBC-100CM-BLK` | 0.71 | scored, not selected (wrong length) |
343
+ | 2 | `CABLE-USBA-200CM-BLK` | 0.38 | scored, not selected (wrong connector) |
344
+ | 3 | `CHAIR-TASK-FIXED-BLK` | 0.04 | ✗ below threshold |
345
+ | 4 | `MON-27-4K-IPS-HDMI2` | 0.01 | ✗ below threshold |
346
+
347
+ ### Real cost example
348
+
349
+ | Setup | Candidate pairs scored by the LLM | Estimated cost |
350
+ |-------|-----------|----------------|
351
+ | 10k × 100k, top_k=5 | 50,000 | ~$0.75 |
352
+ | 10k × 100k, top_k=3 | 30,000 | ~$0.45 |
353
+ | 10k × 100k, embed_threshold=0.95 | ~5,000 (obvious matches skip LLM) | ~$0.08 |
354
+
355
+ ### Further cost controls
356
+
357
+ ```python
358
+ result = fuzzy_join(
359
+ df1, df2,
360
+ left_on="vendor", right_on="supplier",
361
+ llm=my_llm,
362
+ embed_fn=my_embed, context="company names — match legal entity variants and abbreviations",  # context is required
363
+ top_k=3, # fewer candidates = fewer LLM tokens per row
364
+ embed_threshold=0.95, # skip LLM entirely if the top embedding score is >= 0.95
365
+ max_llm_calls=1000, # hard cap — warns and returns partial result if hit
366
+ )
367
+ ```
368
+
369
+ | Parameter | Effect |
370
+ |-----------|--------|
371
+ | `top_k=3` (default 5) | 40% fewer LLM tokens |
372
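+ | `embed_threshold=0.95` | Skip LLM for obvious matches (top embedding score >= 0.95) — typically saves 30–60%. Rows whose top embedding score is below `1 - embed_threshold` (here 0.05) are also skipped, with a warning, as obvious non-matches |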
373
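+ | `max_llm_calls=N` | Budget guard — at most N left rows are sent to the LLM (retries of a failed call are not counted against N); remaining rows are skipped with a warning |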
374
+
375
+ ---
376
+
377
+ ## Usage
378
+
379
+ ### Basic join
380
+
381
+ ```python
382
+ result = fuzzy_join(
383
+ orders_df, catalog_df,
384
+ left_on="product_name",
385
+ right_on="sku",
386
+ llm=my_llm,
387
+ embed_fn=my_embed,
388
+ context="procurement — match buyer product descriptions to supplier SKU codes",
389
+ )
390
+ ```
391
+
392
+ ### With domain context
393
+
394
+ ```python
395
+ result = fuzzy_join(
396
+ orders_df, catalog_df,
397
+ left_on="product_name",
398
+ right_on="sku",
399
+ llm=my_llm,
400
+ embed_fn=my_embed,
401
+ context="procurement — match buyer product descriptions to supplier SKU codes",
402
+ column_context={
403
+ "product_name": "plain English product description written by a buyer",
404
+ "sku": "supplier stock-keeping unit code, typically uppercase with hyphens",
405
+ },
406
+ )
407
+ ```
408
+
409
+ ### See why matches were made
410
+
411
+ ```python
412
+ result = fuzzy_join(
413
+ df1, df2,
414
+ left_on="vendor",
415
+ right_on="supplier_name",
416
+ llm=my_llm,
417
+ embed_fn=my_embed, context="company names — match vendor names to supplier master records",
418
+ return_reasoning=True,
419
+ )
420
+
421
+ print(result[["vendor", "supplier_name", "_llm_score", "_llm_reasoning", "_embed_rank", "_match_method"]])
422
+ ```
423
+
424
+ | vendor | supplier_name | _llm_score | _llm_reasoning | _embed_rank | _match_method |
425
+ |---|---|---:|---|---:|---|
426
+ | Goldman Sachs & Co. | The Goldman Sachs Group Inc | 0.97 | same firm, legal name variant | 0 | llm |
427
+
428
+ ### Control cost
429
+
430
+ ```python
431
+ result = fuzzy_join(
432
+ df1, df2,
433
+ left_on="vendor",
434
+ right_on="supplier_name",
435
+ llm=my_llm,
436
+ embed_fn=my_embed, context="company names — match vendor names to supplier master records",
437
+ embed_threshold=0.95, # skip LLM if embedding match is obvious
438
+ max_llm_calls=500, # hard cap — warns and returns partial result if hit
439
+ top_k=3, # fewer candidates = fewer LLM tokens
440
+ )
441
+ ```
442
+
443
+ ### Left join (audit unmatched rows)
444
+
445
+ `how="left"` keeps all left rows — unmatched ones get NaN right columns. Useful to see what the LLM failed to match.
446
+
447
+ ```python
448
+ result = fuzzy_join(df1, df2, left_on="a", right_on="b", llm=my_llm, embed_fn=my_embed, context="describe what the columns represent and what kind of match to make", how="left")
449
+
450
+ # Rows with no match
451
+ unmatched = result[result["b"].isna()]
452
+ ```
453
+
454
+ > **Note:** `how="outer"` is useful for reconciliation — unmatched left rows are left values with no match above threshold; unmatched right rows are right values that were never selected as a best match for any left row. `cross` join is not supported (it would be the naive O(n×m) approach that llm-join is designed to avoid).
455
+
456
+ ### Multi-column join key
457
+
458
+ ```python
459
+ # orders_df has separate "product_name" and "category" columns
460
+ # catalog_df has a single "sku_description" column like "CABLE / USB-C / 200CM / BLK"
461
+
462
+ result = fuzzy_join(
463
+ orders_df, catalog_df,
464
+ left_on=["product_name", "category"], # concatenated: "USB-C cable 2m · electronics"
465
+ right_on="sku_description",
466
+ llm=my_llm,
467
+ embed_fn=my_embed,
468
+ context="procurement — match buyer product + category to supplier SKU description",
469
+ )
470
+ ```
471
+
472
+ ### Chaining multiple joins
473
+
474
+ Each `fuzzy_join` returns a regular DataFrame — pipe them like `pd.merge`.
475
+
476
+ ```python
477
+ # df1: transactions (vendor + product columns)
478
+ # df2: vendor master
479
+ # df3: product catalog
480
+
481
+ # Step 1 — match vendors
482
+ step1 = fuzzy_join(
483
+ df1, df2,
484
+ left_on="vendor",
485
+ right_on="supplier_name",
486
+ llm=my_llm,
487
+ embed_fn=my_embed, context="company names — match transaction vendors to vendor master records",
488
+ how="left",
489
+ return_reasoning=True,
490
+ )
491
+ step1 = step1.rename(columns={
492
+ "_llm_score": "_vendor_score",
493
+ "_llm_reasoning": "_vendor_reasoning",
494
+ "_match_method": "_vendor_method", "_embed_rank": "_vendor_embed_rank",
495
+ })
496
+
497
+ # Step 2 — match products on result of step 1
498
+ result = fuzzy_join(
499
+ step1, df3,
500
+ left_on="product",
501
+ right_on="catalog_item",
502
+ llm=my_llm,
503
+ embed_fn=my_embed, context="products — match transaction product names to catalog items",
504
+ how="left",
505
+ return_reasoning=True,
506
+ )
507
+ # result now has _vendor_score + _llm_score without column collision
508
+ ```
509
+
510
+ ---
511
+
512
+ ## Works with Any LLM
513
+
514
+ Pass any callable that takes a prompt string and returns a string.
515
+
516
+ ```python
517
+ # OpenAI
518
+ import openai
519
+ client = openai.OpenAI()
520
+ llm = lambda p: client.chat.completions.create(
521
+ model="gpt-4o-mini",
522
+ messages=[{"role": "user", "content": p}]
523
+ ).choices[0].message.content
524
+
525
+ # Anthropic
526
+ import anthropic
527
+ client = anthropic.Anthropic()
528
+ llm = lambda p: client.messages.create(
529
+ model="claude-opus-4-7", max_tokens=512,
530
+ messages=[{"role": "user", "content": p}]
531
+ ).content[0].text
532
+
533
+ # Google Gemini
534
+ import google.generativeai as genai
535
+ model = genai.GenerativeModel("gemini-2.0-flash")
536
+ llm = lambda p: model.generate_content(p).text
537
+
538
+ # Ollama (local, free)
539
+ import ollama
540
+ llm = lambda p: ollama.chat(
541
+ model="llama3.2", messages=[{"role": "user", "content": p}]
542
+ )["message"]["content"]
543
+
544
+ # Any custom endpoint
545
+ import requests
546
+ llm = lambda p: requests.post(
547
+ "https://your-llm-api.com/chat",
548
+ json={"prompt": p}
549
+ ).json()["response"]
550
+ ```
551
+
552
+ ## Works with Any Embedding Function
553
+
554
+ Pass any callable `(list[str]) -> np.ndarray` (shape `[n, dim]`, dtype `float32`).
555
+
556
+ ```python
557
+ import numpy as np
558
+
559
+ # OpenAI embeddings
560
+ import openai
561
+ client = openai.OpenAI()
562
+ def my_embed(texts):
563
+ response = client.embeddings.create(model="text-embedding-3-small", input=texts)
564
+ return np.array([d.embedding for d in response.data], dtype="float32")
565
+
566
+ # Cohere
567
+ import cohere
568
+ co = cohere.Client("YOUR_KEY")
569
+ def my_embed(texts):
570
+ response = co.embed(texts=texts, model="embed-english-v3.0", input_type="search_document")
571
+ return np.array(response.embeddings, dtype="float32")
572
+
573
+ # sentence-transformers (local)
574
+ from sentence_transformers import SentenceTransformer
575
+ model = SentenceTransformer("all-MiniLM-L6-v2")
576
+ def my_embed(texts):
577
+ return model.encode(texts, convert_to_numpy=True).astype("float32")
578
+
579
+ # Any custom endpoint
580
+ import requests
581
+ def my_embed(texts):
582
+ response = requests.post(
583
+ "https://your-embed-api.com/embed",
584
+ json={"texts": texts}
585
+ ).json()
586
+ return np.array(response["embeddings"], dtype="float32")
587
+ ```
588
+
589
+ ---
590
+
591
+ ## vs. Alternatives
592
+
593
+ | | llm-join | pd.merge | fuzzywuzzy | Jellyjoin | LinkTransformer |
594
+ |---|:---:|:---:|:---:|:---:|:---:|
595
+ | Semantic matching | ✓ | ✗ | ✗ | ✓ | ✓ |
596
+ | LLM makes final decision | ✓ | ✗ | ✗ | ✗ | optional |
597
+ | Reasoning per match | ✓ | ✗ | ✗ | ✗ | ✗ |
598
+ | Domain context injection | ✓ | ✗ | ✗ | ✗ | ✗ |
599
+ | Bring-your-own LLM callable | ✓ | n/a | n/a | ✗ | ✗ |
600
+ | Bring-your-own embed callable | ✓ | n/a | n/a | ✗ | ✗ |
601
+ | Enterprise-safe (no forced downloads) | ✓ | ✓ | ✓ | partial | ✗ |
602
+ | Hard cost cap (`max_llm_calls`) | ✓ | n/a | n/a | ✗ | ✗ |
603
+ | License | **MIT** | BSD | MIT | MIT | **GPL-3.0** |
604
+ | Install size | ~50 MB | ~20 MB | ~30 MB | ~60 MB | **~3 GB** |
605
+
606
+ ---
607
+
608
+ ## Parameters
609
+
610
+ | Parameter | Default | Description |
611
+ |-----------|---------|-------------|
612
+ | `left_on` | required | Column name(s) in df1 |
613
+ | `right_on` | required | Column name(s) in df2 |
614
+ | `llm` | required | Callable `(prompt: str) -> str` — your LLM function |
615
+ | `embed_fn` | required | Callable `(list[str]) -> np.ndarray` — your embedding function |
616
+ | `context` | required | Domain context injected into LLM prompt — describe what the columns represent and what kind of match to make |
617
+ | `column_context` | `{}` | Per-column context dict `{"col": "description"}` |
618
+ | `top_k` | `5` | Embedding candidates retrieved per row before LLM scoring |
619
+ | `batch_size` | `32` | Reserved for future LLM batching (passed through to config) |
620
+ | `threshold` | `0.7` | Minimum LLM score (0–1) to accept a match |
621
+ | `how` | `"inner"` | Join type: `inner` (matched pairs only) / `left` (all left rows, NaN where unmatched) / `right` (all right rows, NaN where no left row matched them) / `outer` (full picture — both unmatched left and unmatched right rows). |
622
+ | `embed_threshold` | `None` | Skip the LLM when the best embedding match score exceeds this value (must be in (0, 1]) — saves cost on obvious matches |
623
+ | `max_llm_calls` | `None` | Hard cap on LLM calls — returns partial result with warning if hit |
624
+ | `max_retries` | `3` | Retry failed LLM calls with exponential backoff (1s, 2s, 4s…). Set `0` to disable. |
625
+ | `return_reasoning` | `False` | Append `_llm_score`, `_llm_reasoning`, `_embed_rank`, `_match_method` columns. `_match_method` is `"llm"` or `"embed_threshold"` — tells you which rows skipped the LLM. |
626
+
627
+ ---
628
+
629
+ ## License
630
+
631
+ MIT
@@ -0,0 +1,10 @@
1
+ llm_join/__init__.py,sha256=6xpu-ZnwfWvtH-uOX3PY4jgAA1I5D8WSp8xL__kVyp4,85
2
+ llm_join/config.py,sha256=ne_2cY1HFGkUMTa_YSRqU2W41LytM9PQQWYR8c48j2Q,2130
3
+ llm_join/join.py,sha256=KTMzgIpdp9hypq7ahnv2XZBs0nzwkJ46vrAPHcRPgb8,4537
4
+ llm_join/merger.py,sha256=S92_aMtgxQohUzqoVeTaqQBuq3zJ29L4xbzClf-0kzg,2306
5
+ llm_join/prompts.py,sha256=soh7WKIKExgN8cSMFxkVKsOLGuYKGyTlXaLrVRF0KJQ,625
6
+ llm_join/retriever.py,sha256=mY-Rx1TWpHOqTA7dxgxIN_KzW-mhasTf9OXhx1x0Fu8,2012
7
+ llm_join/scorer.py,sha256=3zwMFfFigo6XoSDq0IgFrypRcsNd9OdGc2hW3iY-BhQ,5821
8
+ llm_join-0.2.0.dist-info/METADATA,sha256=lCji78ey-p8GhSvaWYHJLjwmV02Ms1C5f6kSOnDHNQg,21237
9
+ llm_join-0.2.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
10
+ llm_join-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any