thrifty-ml 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. thrifty_ml-0.1.0/.github/actions/uv_setup/action.yml +19 -0
  2. thrifty_ml-0.1.0/.github/workflows/release.yml +65 -0
  3. thrifty_ml-0.1.0/.gitignore +225 -0
  4. thrifty_ml-0.1.0/CLAUDE.md +60 -0
  5. thrifty_ml-0.1.0/Design.md +251 -0
  6. thrifty_ml-0.1.0/LICENSE +21 -0
  7. thrifty_ml-0.1.0/PKG-INFO +381 -0
  8. thrifty_ml-0.1.0/README.md +340 -0
  9. thrifty_ml-0.1.0/benchmarks/__init__.py +0 -0
  10. thrifty_ml-0.1.0/benchmarks/imdb/README.md +113 -0
  11. thrifty_ml-0.1.0/benchmarks/imdb/__init__.py +0 -0
  12. thrifty_ml-0.1.0/benchmarks/imdb/data.py +13 -0
  13. thrifty_ml-0.1.0/benchmarks/imdb/instrumentation.py +99 -0
  14. thrifty_ml-0.1.0/benchmarks/imdb/run.py +208 -0
  15. thrifty_ml-0.1.0/pyproject.toml +63 -0
  16. thrifty_ml-0.1.0/tests/__init__.py +0 -0
  17. thrifty_ml-0.1.0/tests/test_cli.py +127 -0
  18. thrifty_ml-0.1.0/tests/test_engine.py +148 -0
  19. thrifty_ml-0.1.0/tests/test_engine_integration.py +276 -0
  20. thrifty_ml-0.1.0/tests/test_imdb_benchmark.py +254 -0
  21. thrifty_ml-0.1.0/tests/test_proxies.py +104 -0
  22. thrifty_ml-0.1.0/tests/test_sampling.py +71 -0
  23. thrifty_ml-0.1.0/thrifty_ml/__init__.py +226 -0
  24. thrifty_ml-0.1.0/thrifty_ml/_utils.py +40 -0
  25. thrifty_ml-0.1.0/thrifty_ml/cache.py +89 -0
  26. thrifty_ml-0.1.0/thrifty_ml/cli.py +185 -0
  27. thrifty_ml-0.1.0/thrifty_ml/embeddings.py +85 -0
  28. thrifty_ml-0.1.0/thrifty_ml/engine.py +195 -0
  29. thrifty_ml-0.1.0/thrifty_ml/evaluator.py +101 -0
  30. thrifty_ml-0.1.0/thrifty_ml/llm.py +112 -0
  31. thrifty_ml-0.1.0/thrifty_ml/proxy/__init__.py +0 -0
  32. thrifty_ml-0.1.0/thrifty_ml/proxy/base.py +29 -0
  33. thrifty_ml-0.1.0/thrifty_ml/proxy/linear.py +40 -0
  34. thrifty_ml-0.1.0/thrifty_ml/proxy/trees.py +64 -0
  35. thrifty_ml-0.1.0/thrifty_ml/sampling.py +34 -0
  36. thrifty_ml-0.1.0/uv.lock +3101 -0
@@ -0,0 +1,19 @@
1
+ name: uv-install
2
+ description: Set up Python and uv
3
+
4
+ inputs:
5
+ python-version:
6
+ description: Python version, supporting MAJOR.MINOR only
7
+ required: true
8
+
9
+ env:
10
+ UV_VERSION: "0.5.25"
11
+
12
+ runs:
13
+ using: composite
14
+ steps:
15
+ - name: Install uv and set the python version
16
+ uses: astral-sh/setup-uv@v5
17
+ with:
18
+ version: ${{ env.UV_VERSION }}
19
+ python-version: ${{ inputs.python-version }}
@@ -0,0 +1,65 @@
1
+ name: Publish to TestPyPI and PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Set up Python + uv
15
+ uses: "./.github/actions/uv_setup"
16
+ with:
17
+ python-version: "3.11"
18
+
19
+ - name: Build dists
20
+ run: uv build
21
+
22
+ - name: Upload dist
23
+ uses: actions/upload-artifact@v4
24
+ with:
25
+ name: dist
26
+ path: dist/
27
+
28
+ publish-testpypi:
29
+ needs: build
30
+ runs-on: ubuntu-latest
31
+ environment:
32
+ name: testpypi
33
+ url: https://test.pypi.org/p/thrifty-ml
34
+ permissions:
35
+ id-token: write
36
+ steps:
37
+ - uses: actions/download-artifact@v4
38
+ with:
39
+ name: dist
40
+ path: dist
41
+
42
+ - name: Publish to TestPyPI
43
+ uses: pypa/gh-action-pypi-publish@release/v1
44
+ with:
45
+ repository-url: https://test.pypi.org/legacy/
46
+ verbose: true
47
+
48
+ publish-pypi:
49
+ needs: publish-testpypi
50
+ runs-on: ubuntu-latest
51
+ environment:
52
+ name: pypi
53
+ url: https://pypi.org/p/thrifty-ml
54
+ permissions:
55
+ id-token: write
56
+ steps:
57
+ - uses: actions/download-artifact@v4
58
+ with:
59
+ name: dist
60
+ path: dist
61
+
62
+ - name: Publish to PyPI
63
+ uses: pypa/gh-action-pypi-publish@release/v1
64
+ with:
65
+ verbose: true
@@ -0,0 +1,225 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .env
152
+ .envrc
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # pytype static type analyzer
179
+ .pytype/
180
+
181
+ # Cython debug symbols
182
+ cython_debug/
183
+
184
+ # PyCharm
185
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
188
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189
+ # .idea/
190
+
191
+ # Abstra
192
+ # Abstra is an AI-powered process automation framework.
193
+ # Ignore directories containing user credentials, local state, and settings.
194
+ # Learn more at https://abstra.io/docs
195
+ .abstra/
196
+
197
+ # Visual Studio Code
198
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
201
+ # you could uncomment the following to ignore the entire vscode folder
202
+ # .vscode/
203
+ # Temporary file for partial code execution
204
+ tempCodeRunnerFile.py
205
+
206
+ # Ruff stuff:
207
+ .ruff_cache/
208
+
209
+ # PyPI configuration file
210
+ .pypirc
211
+
212
+ # Marimo
213
+ marimo/_static/
214
+ marimo/_lsp/
215
+ __marimo__/
216
+
217
+ # Streamlit
218
+ .streamlit/secrets.toml
219
+
220
+ # Benchmark outputs
221
+ benchmarks/imdb/results*.json
222
+ benchmarks/imdb/.cache_*/
223
+
224
+ # Claude Code
225
+ .claude/
@@ -0,0 +1,60 @@
1
+ # thrifty-ml
2
+
3
+ ## gstack
4
+
5
+ Use `/browse` from gstack for all web browsing. Never use `mcp__claude-in-chrome__*` tools.
6
+
7
+ Available gstack skills:
8
+ - `/office-hours` — office hours facilitation
9
+ - `/plan-ceo-review` — CEO review planning
10
+ - `/plan-eng-review` — engineering review planning
11
+ - `/plan-design-review` — design review planning
12
+ - `/design-consultation` — design consultation
13
+ - `/design-shotgun` — rapid design exploration
14
+ - `/design-html` — HTML design generation
15
+ - `/review` — code review
16
+ - `/ship` — ship a feature
17
+ - `/land-and-deploy` — land and deploy changes
18
+ - `/canary` — canary deployment
19
+ - `/benchmark` — benchmarking
20
+ - `/browse` — headless browser for web browsing and QA
21
+ - `/connect-chrome` — connect to Chrome
22
+ - `/qa` — QA testing
23
+ - `/qa-only` — QA without implementation
24
+ - `/design-review` — design review
25
+ - `/setup-browser-cookies` — configure browser cookies
26
+ - `/setup-deploy` — configure deployment
27
+ - `/setup-gbrain` — configure gbrain
28
+ - `/retro` — retrospective
29
+ - `/investigate` — investigation and debugging
30
+ - `/document-release` — release documentation
31
+ - `/document-generate` — documentation generation
32
+ - `/codex` — codex operations
33
+ - `/cso` — CSO operations
34
+ - `/autoplan` — automated planning
35
+ - `/plan-devex-review` — developer experience review planning
36
+ - `/devex-review` — developer experience review
37
+ - `/careful` — careful mode for risky changes
38
+ - `/freeze` — freeze deployments
39
+ - `/guard` — guard mode
40
+ - `/unfreeze` — unfreeze deployments
41
+ - `/gstack-upgrade` — upgrade gstack
42
+ - `/learn` — learning and documentation
43
+
44
+ ## Skill routing
45
+
46
+ When the user's request matches an available skill, ALWAYS invoke it using the Skill
47
+ tool as your FIRST action. Do NOT answer directly, do NOT use other tools first.
48
+ The skill has specialized workflows that produce better results than ad-hoc answers.
49
+
50
+ Key routing rules:
51
+ - Product ideas, "is this worth building", brainstorming → invoke office-hours
52
+ - Bugs, errors, "why is this broken", 500 errors → invoke investigate
53
+ - Ship, deploy, push, create PR → invoke ship
54
+ - QA, test the site, find bugs → invoke qa
55
+ - Code review, check my diff → invoke review
56
+ - Update docs after shipping → invoke document-release
57
+ - Weekly retro → invoke retro
58
+ - Design system, brand → invoke design-consultation
59
+ - Visual audit, design polish → invoke design-review
60
+ - Architecture review → invoke plan-eng-review
@@ -0,0 +1,251 @@
1
+ # thrifty-ml Design
2
+
3
+ ## What it does
4
+
5
+ thrifty-ml replaces per-row LLM calls with lightweight ML classifiers ("proxy models") trained on text embeddings. Instead of asking an LLM to evaluate every row in a DataFrame, it:
6
+
7
+ 1. Labels a small sample (~1 000 rows) using the LLM.
8
+ 2. Embeds all rows with an embedding model.
9
+ 3. Trains a fast classifier (logistic regression, SVM, or LightGBM) on the labeled sample.
10
+ 4. Evaluates classifier quality on a holdout split.
11
+ 5. If quality is good enough, uses the classifier to label the remaining rows instead of the LLM.
12
+
13
+ On large datasets this yields 100–1 000× cheaper and faster labeling with accuracy that matches the LLM within a configurable tolerance.
14
+
15
+ The technique is described in [arXiv 2603.15970](https://arxiv.org/html/2603.15970v6) and is used inside Google's BigQuery `AI.IF` and AlloyDB accelerated functions. thrifty-ml ports it to any Python DataFrame.
16
+
17
+ ---
18
+
19
+ ## Architecture
20
+
21
+ ```
22
+ thrifty_ml/
23
+ ├── __init__.py # Public API: ml_filter, ml_classify, Proxy, EmbeddingBackend
24
+ ├── engine.py # Engine — the orchestration core
25
+ ├── sampling.py # random_sample(): splits df into sample + remainder
26
+ ├── evaluator.py # evaluate(), train_holdout_split()
27
+ ├── cache.py # diskcache wrappers for embeddings and labels
28
+ ├── llm.py # Async LLM labeling via LiteLLM
29
+ ├── embeddings.py # EmbeddingBackend ABC + LiteLLMEmbeddingBackend
30
+ ├── cli.py # Typer CLI (filter, classify, embed, label, cache clear)
31
+ └── proxy/
32
+ ├── base.py # ProxyModel ABC (fit, predict, save, load)
33
+ ├── linear.py # LogisticRegressionProxy, LinearSVCProxy
34
+ └── trees.py # LightGBMProxy
35
+ ```
36
+
37
+ ---
38
+
39
+ ## Pipeline
40
+
41
+ Both online and offline modes share the same core pipeline inside `Engine`:
42
+
43
+ ```
44
+ df
45
+
46
+ ├─ embed_texts(all rows) ──────────── cache hit/miss per text ──── diskcache
47
+
48
+ ├─ random_sample(sample_size)
49
+ │ │
50
+ │ └─ label_texts(sample) ─────── LLM calls (async, batched) ── diskcache
51
+
52
+ ├─ train_holdout_split(labeled sample)
53
+ │ │ 80 % train / 20 % holdout (stratified when possible)
54
+ │ │
55
+ │ └─ proxy.fit(X_train, y_train)
56
+
57
+ ├─ evaluate(proxy, X_holdout, y_holdout)
58
+ │ │ proxy_f1 >= 1.0 - τ → use_proxy = True
59
+ │ │ otherwise emit UserWarning and fall back to LLM
60
+ │ │
61
+ │ └─ τ = fallback_threshold (default 0.1)
62
+
63
+ └─ predict remainder
64
+ if use_proxy: proxy.predict(X_remainder)
65
+ else: label_texts(remainder) ← full LLM cost
66
+ ```
67
+
68
+ **Online mode** (`ml_filter` / `ml_classify`) runs the whole pipeline and returns labels for every row.
69
+
70
+ **Offline mode** (`Proxy.fit` / `Proxy.predict`) runs up to and including `proxy.fit`, then serializes the trained model. `predict()` only embeds + classifies — no LLM calls in the hot path.
71
+
72
+ ---
73
+
74
+ ## Key components
75
+
76
+ ### Engine (`engine.py`)
77
+
78
+ Accepts a prompt, LLM model string, embedding backend, and proxy type. Coordinates all pipeline stages. Two entry points:
79
+
80
+ - `run(df, text_column)` — online, returns a label array for the full DataFrame.
81
+ - `fit(df, text_column)` — offline, returns `(proxy_model, eval_result)` for serialization.
82
+
83
+ `_run_async(coro)` is a small helper that makes the async LLM calls work in any context — plain scripts use `asyncio.run()`, Jupyter notebooks (which already have a running loop) use `nest_asyncio` if available, or fall back to a `ThreadPoolExecutor` to avoid `RuntimeError: This event loop is already running`.
84
+
85
+ ### Sampling (`sampling.py`)
86
+
87
+ `random_sample(df, n, seed)` returns `(sample_df, remainder_df)`. If `n >= len(df)` the entire DataFrame is the sample and remainder is empty (LLM labels are returned directly, no proxy needed).
88
+
89
+ ### Evaluator (`evaluator.py`)
90
+
91
+ `train_holdout_split(X, y, holdout_fraction=0.2)` uses `StratifiedShuffleSplit` when all classes have ≥ 2 samples; falls back to a random permutation when the sample is too small to stratify.
92
+
93
+ `evaluate(proxy, X_train, y_train, X_holdout, y_holdout, fallback_threshold)`:
94
+
95
+ - Fits the proxy on the train split.
96
+ - Computes `proxy_f1` using `f1_score` with `average="binary"` for two-class problems and `"macro"` for three or more. The `average` is determined from the **union** of train and holdout labels, so a class that only appears in holdout does not cause a crash.
97
+ - Compares `proxy_f1 >= 1.0 - fallback_threshold`. Because LLM labels are used as ground truth, the LLM's own F1 is trivially 1.0.
98
+ - Emits a `UserWarning` and sets `use_proxy=False` on failure.
99
+
100
+ ### LLM labeling (`llm.py`)
101
+
102
+ `label_texts(texts, prompt, model, classes, max_concurrency, cache_dir)` is an async function. It fires one coroutine per text, throttled by an `asyncio.Semaphore`. Each call:
103
+
104
+ 1. Checks the label cache.
105
+ 2. On a miss: calls `litellm.acompletion` with `response_format={"type": "json_object"}` and `temperature=0`.
106
+ 3. For binary mode (no `classes`): parses `{"label": true/false}`.
107
+ 4. For multiclass: parses `{"label": "<class>"}` and returns `"__unknown__"` if the value is not in the allowed list.
108
+ 5. Writes the result to the label cache.
109
+
110
+ Retries on transient errors are handled by `_litellm_call_with_retry` in `_utils.py`.
111
+
112
+ ### Embeddings (`embeddings.py`)
113
+
114
+ `EmbeddingBackend` is an ABC with two requirements:
115
+
116
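+ - `model_id: str` — stable string identifying the backend; it is one component of the diskcache key (alongside the text hash), so it must not change between runs.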
117
+ - `embed(texts: list[str]) -> np.ndarray` — returns a float32 array of shape `(n, dim)`.
118
+
119
+ `LiteLLMEmbeddingBackend(model: str)` is the default implementation. It sends texts to any LiteLLM-supported embedding provider in chunks of 2 048.
120
+
121
+ `embed_texts(texts, backend, cache_dir)` does cache-then-fill: checks the cache for each text, collects misses, calls `backend.embed()` once for all misses, and writes results back. A warning is emitted for inputs exceeding 250 000 texts (memory risk).
122
+
123
+ Custom backends — sentence-transformers, pre-computed vectors, proprietary APIs — implement `EmbeddingBackend` and pass an instance anywhere a model string is accepted.
124
+
125
+ ### Proxy models (`proxy/`)
126
+
127
+ `ProxyModel` ABC (`base.py`) defines `fit(X, y)`, `predict(X)`, optional `predict_proba(X)`, and default `save`/`load` via `joblib`.
128
+
129
+ | Class | Backend | Imbalance handling | `save`/`load` |
130
+ |---|---|---|---|
131
+ | `LogisticRegressionProxy` | sklearn `LogisticRegression` | `class_weight="balanced"` | joblib |
132
+ | `LinearSVCProxy` | sklearn `LinearSVC` | `class_weight="balanced"` | joblib |
133
+ | `LightGBMProxy` | LightGBM `LGBMClassifier` | `is_unbalance=True` | LightGBM booster text format |
134
+
135
+ `LightGBMProxy` overrides `save`/`load` to use the LightGBM native booster format (not joblib), since joblib-serialized LightGBM objects are not portable across LightGBM versions.
136
+
137
+ ### Caching (`cache.py`)
138
+
139
+ All embeddings and labels are cached in a `diskcache.FanoutCache` (8 SQLite shards, 60 s timeout) at `~/.cache/thrifty_ml/` by default.
140
+
141
+ Cache keys are namespaced by version: `thrifty_ml/{VERSION}/...`
142
+
143
+ - **Embedding key**: `emb / sha256(text) / model_id`
144
+ - **Label key**: `lbl / sha256(text) / sha256(prompt) / model_id / classes_key`
145
+
146
+ `classes_key` is `"binary"` when no classes are provided, or `sha256(sorted(classes))` for multiclass. This ensures that cached binary labels are not reused for a multiclass query on the same text+prompt, and vice versa.
147
+
148
+ ### `Proxy` save/load (`__init__.py`)
149
+
150
+ `Proxy.save(path)` writes two files:
151
+
152
+ - `path` — the serialized proxy model (joblib or LightGBM native).
153
+ - `path.meta.json` — a sidecar: `{"proxy_type": "lr"|"svc"|"lgbm", "embedding_model": "<model_id>"}`.
154
+
155
+ `Proxy.load(path, embedding_model=None)` reads the sidecar to determine the proxy type and embedding model, then dispatches to the correct backend's `load()`. If `embedding_model` is passed explicitly it overrides the sidecar value (required for custom `EmbeddingBackend` subclasses, since only a string model ID is stored in the sidecar).
156
+
157
+ ---
158
+
159
+ ## Public API
160
+
161
+ ```python
162
+ from thrifty_ml import ml_filter, ml_classify, Proxy, EmbeddingBackend
163
+
164
+ # Online binary filter
165
+ mask = ml_filter(
166
+ df,
167
+ prompt="Is this review positive?",
168
+ text_column="review",
169
+ llm="anthropic/claude-haiku-4-5",
170
+ embedding_model="text-embedding-3-small",
171
+ proxy="lr", # "lr" | "svc" | "lgbm"
172
+ sample_size=1000,
173
+ fallback_threshold=0.1, # τ: proxy F1 must be >= 1.0 - τ
174
+ )
175
+
176
+ # Online multiclass
177
+ labels = ml_classify(
178
+ df,
179
+ prompt="Classify support ticket intent",
180
+ text_column="body",
181
+ llm="anthropic/claude-haiku-4-5",
182
+ embedding_model="text-embedding-3-small",
183
+ classes=["billing", "tech", "other"],
184
+ )
185
+
186
+ # Offline sklearn-style
187
+ proxy = Proxy(prompt="...", llm="...", embedding_model="...", model="lgbm")
188
+ proxy.fit(train_df, "text")
189
+ proxy.save("proxy.lgbm")
190
+
191
+ loaded = Proxy.load("proxy.lgbm")
192
+ preds = loaded.predict(new_df, "text") # no LLM calls
193
+
194
+ # Custom embedding backend
195
+ class MyBackend(EmbeddingBackend):
196
+ model_id = "my-model-v1"
197
+ def embed(self, texts):
198
+ ... # return np.ndarray of shape (len(texts), dim)
199
+
200
+ mask = ml_filter(df, ..., embedding_model=MyBackend())
201
+ ```
202
+
203
+ ---
204
+
205
+ ## CLI
206
+
207
+ ```
208
+ thrifty-ml filter input.parquet --prompt "..." --text-col review --out mask.parquet
209
+ thrifty-ml classify input.parquet --prompt "..." --text-col review --classes a,b,c --out labels.parquet
210
+ thrifty-ml embed input.parquet --text-col review --model text-embedding-3-small --out embeds.npy
211
+ thrifty-ml label input.parquet --prompt "..." --text-col review --sample 1000 --out labels.parquet
212
+ thrifty-ml cache clear
213
+ ```
214
+
215
+ All commands accept `--llm`, `--embedding-model`, `--proxy`, `--cache-dir`, `--sample-size`, `--fallback-threshold`, `--max-concurrency`, `--seed`. Input formats: `.parquet`, `.csv`, `.json`, `.jsonl`.
216
+
217
+ ---
218
+
219
+ ## Design decisions
220
+
221
+ **Why logistic regression as the default proxy?** Embedding models are trained to produce linearly separable representations. The paper's own ablation finds that LR almost always matches or beats more complex classifiers when using modern embeddings. LR trains in seconds on 1 000 samples and infers in < 1 ms per batch.
222
+
223
+ **Why LightGBM for non-linear tasks?** Histogram-based splits are fast on dense float32 arrays, training is low-memory, and the install is lightweight. It handles class imbalance natively via `is_unbalance=True`.
224
+
225
+ **Why a separate sidecar file for `save`/`load`?** LightGBM and sklearn use different serialization formats (LightGBM native vs joblib). Storing proxy type and embedding model in a `.meta.json` sidecar lets `Proxy.load()` self-describe and dispatch correctly without requiring callers to remember or pass the original arguments.
226
+
227
+ **Why include `classes` in the label cache key?** A binary filter (`ml_filter`) and a multiclass classifier (`ml_classify`) can share the same prompt and model. Without a `classes` component in the key, a cached binary `True`/`False` label could be silently returned for a multiclass query expecting `"billing"` / `"tech"` / `"other"`.
228
+
229
+ **Why `_run_async` instead of `asyncio.run` everywhere?** `asyncio.run()` raises `RuntimeError: asyncio.run() cannot be called from a running event loop` when a loop is already running, as is the case inside Jupyter notebooks and async frameworks like FastAPI. `_run_async` detects a running loop and either patches it with `nest_asyncio` or offloads to a thread, making the library usable without any setup from the caller.
230
+
231
+ **Fallback threshold τ.** The paper reports that τ = 0.1 (i.e., proxy F1 ≥ 0.9, since the LLM's own F1 is taken to be 1.0) covers > 95% of production use cases. The default is 0.1, but it is user-configurable. Setting τ = 0.0 requires the proxy to achieve perfect F1 on the holdout, so in practice the pipeline will almost always fall back to the LLM.
232
+
233
+ ---
234
+
235
+ ## Advantages over the SQL approach (BigQuery AI.IF / AlloyDB)
236
+
237
+ The paper's technique is implemented as SQL functions inside Google's data warehouse products — `AI.IF` in BigQuery and accelerated functions in AlloyDB. That surface imposes several constraints that thrifty-ml removes.
238
+
239
+ **No infrastructure dependency.** The SQL approach requires data to be in BigQuery or AlloyDB, a GCP account, and quota. thrifty-ml works on any DataFrame — pandas, a local parquet file, a CSV — with no cloud account required. The technique is available to anyone running Python.
240
+
241
+ **Any LLM and any embedding provider.** BigQuery and AlloyDB are wired to specific Google models (Vertex AI / Gemini). thrifty-ml uses LiteLLM as the adapter layer, so the same code path works with Anthropic, OpenAI, Bedrock, Vertex, a local Ollama instance, or any LiteLLM-supported provider. The embedding backend is similarly pluggable via the `EmbeddingBackend` ABC — you can bring pre-computed vectors, a fine-tuned sentence-transformer, or a proprietary embedding API.
242
+
243
+ **Offline / deploy-once mode.** SQL functions re-run the sample-label-train pipeline on each query invocation. thrifty-ml's `Proxy` class separates `fit` from `predict`: you train once, serialize to disk (e.g. `proxy.joblib` plus its `proxy.joblib.meta.json` sidecar), and deploy the classifier independently. Subsequent predictions make zero LLM calls and run at classifier speed (~0.1 ms / 1 000 rows for logistic regression).
244
+
245
+ **Observability and control.** Inside a SQL function you cannot inspect intermediate results. In thrifty-ml, `EvalResult` exposes `proxy_f1`, `llm_f1`, `use_proxy`, and `holdout_size` after every run. You can tune `fallback_threshold` explicitly, inspect the trained sklearn/LightGBM object, check holdout predictions, or step through the pipeline in a notebook. The SQL surface hides all of this.
246
+
247
+ **Integration with the Python ML ecosystem.** Because proxy models are sklearn or LightGBM objects, standard tooling applies directly — feature importances, calibration, cross-validation, SHAP explanations, model registries. None of that is available through a SQL function.
248
+
249
+ **Iterative development workflow.** A data scientist iterating on a prompt or embedding model has a much tighter feedback loop in Python — run in a notebook, inspect the sample labels, check the proxy F1, adjust, re-run — versus having to push data to a warehouse, re-run a SQL query, and parse results from a table each iteration.
250
+
251
+ The cost and latency wins described in the paper (300–1 000× reduction at 10M-row scale) transfer fully to thrifty-ml because the underlying technique is identical. The difference is that thrifty-ml makes those wins available without requiring a Google data warehouse.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Derrick Kondo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.