thrifty-ml 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thrifty_ml-0.1.0/.github/actions/uv_setup/action.yml +19 -0
- thrifty_ml-0.1.0/.github/workflows/release.yml +65 -0
- thrifty_ml-0.1.0/.gitignore +225 -0
- thrifty_ml-0.1.0/CLAUDE.md +60 -0
- thrifty_ml-0.1.0/Design.md +251 -0
- thrifty_ml-0.1.0/LICENSE +21 -0
- thrifty_ml-0.1.0/PKG-INFO +381 -0
- thrifty_ml-0.1.0/README.md +340 -0
- thrifty_ml-0.1.0/benchmarks/__init__.py +0 -0
- thrifty_ml-0.1.0/benchmarks/imdb/README.md +113 -0
- thrifty_ml-0.1.0/benchmarks/imdb/__init__.py +0 -0
- thrifty_ml-0.1.0/benchmarks/imdb/data.py +13 -0
- thrifty_ml-0.1.0/benchmarks/imdb/instrumentation.py +99 -0
- thrifty_ml-0.1.0/benchmarks/imdb/run.py +208 -0
- thrifty_ml-0.1.0/pyproject.toml +63 -0
- thrifty_ml-0.1.0/tests/__init__.py +0 -0
- thrifty_ml-0.1.0/tests/test_cli.py +127 -0
- thrifty_ml-0.1.0/tests/test_engine.py +148 -0
- thrifty_ml-0.1.0/tests/test_engine_integration.py +276 -0
- thrifty_ml-0.1.0/tests/test_imdb_benchmark.py +254 -0
- thrifty_ml-0.1.0/tests/test_proxies.py +104 -0
- thrifty_ml-0.1.0/tests/test_sampling.py +71 -0
- thrifty_ml-0.1.0/thrifty_ml/__init__.py +226 -0
- thrifty_ml-0.1.0/thrifty_ml/_utils.py +40 -0
- thrifty_ml-0.1.0/thrifty_ml/cache.py +89 -0
- thrifty_ml-0.1.0/thrifty_ml/cli.py +185 -0
- thrifty_ml-0.1.0/thrifty_ml/embeddings.py +85 -0
- thrifty_ml-0.1.0/thrifty_ml/engine.py +195 -0
- thrifty_ml-0.1.0/thrifty_ml/evaluator.py +101 -0
- thrifty_ml-0.1.0/thrifty_ml/llm.py +112 -0
- thrifty_ml-0.1.0/thrifty_ml/proxy/__init__.py +0 -0
- thrifty_ml-0.1.0/thrifty_ml/proxy/base.py +29 -0
- thrifty_ml-0.1.0/thrifty_ml/proxy/linear.py +40 -0
- thrifty_ml-0.1.0/thrifty_ml/proxy/trees.py +64 -0
- thrifty_ml-0.1.0/thrifty_ml/sampling.py +34 -0
- thrifty_ml-0.1.0/uv.lock +3101 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
name: uv-install
|
|
2
|
+
description: Set up Python and uv
|
|
3
|
+
|
|
4
|
+
inputs:
|
|
5
|
+
python-version:
|
|
6
|
+
description: Python version, supporting MAJOR.MINOR only
|
|
7
|
+
required: true
|
|
8
|
+
|
|
9
|
+
env:
|
|
10
|
+
UV_VERSION: "0.5.25"
|
|
11
|
+
|
|
12
|
+
runs:
|
|
13
|
+
using: composite
|
|
14
|
+
steps:
|
|
15
|
+
- name: Install uv and set the python version
|
|
16
|
+
uses: astral-sh/setup-uv@v5
|
|
17
|
+
with:
|
|
18
|
+
version: ${{ env.UV_VERSION }}
|
|
19
|
+
python-version: ${{ inputs.python-version }}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
name: Publish to TestPyPI and PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
|
|
14
|
+
- name: Set up Python + uv
|
|
15
|
+
uses: "./.github/actions/uv_setup"
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.11"
|
|
18
|
+
|
|
19
|
+
- name: Build dists
|
|
20
|
+
run: uv build
|
|
21
|
+
|
|
22
|
+
- name: Upload dist
|
|
23
|
+
uses: actions/upload-artifact@v4
|
|
24
|
+
with:
|
|
25
|
+
name: dist
|
|
26
|
+
path: dist/
|
|
27
|
+
|
|
28
|
+
publish-testpypi:
|
|
29
|
+
needs: build
|
|
30
|
+
runs-on: ubuntu-latest
|
|
31
|
+
environment:
|
|
32
|
+
name: testpypi
|
|
33
|
+
url: https://test.pypi.org/p/thrifty-ml
|
|
34
|
+
permissions:
|
|
35
|
+
id-token: write
|
|
36
|
+
steps:
|
|
37
|
+
- uses: actions/download-artifact@v4
|
|
38
|
+
with:
|
|
39
|
+
name: dist
|
|
40
|
+
path: dist
|
|
41
|
+
|
|
42
|
+
- name: Publish to TestPyPI
|
|
43
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
44
|
+
with:
|
|
45
|
+
repository-url: https://test.pypi.org/legacy/
|
|
46
|
+
verbose: true
|
|
47
|
+
|
|
48
|
+
publish-pypi:
|
|
49
|
+
needs: publish-testpypi
|
|
50
|
+
runs-on: ubuntu-latest
|
|
51
|
+
environment:
|
|
52
|
+
name: pypi
|
|
53
|
+
url: https://pypi.org/p/thrifty-ml
|
|
54
|
+
permissions:
|
|
55
|
+
id-token: write
|
|
56
|
+
steps:
|
|
57
|
+
- uses: actions/download-artifact@v4
|
|
58
|
+
with:
|
|
59
|
+
name: dist
|
|
60
|
+
path: dist
|
|
61
|
+
|
|
62
|
+
- name: Publish to PyPI
|
|
63
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
64
|
+
with:
|
|
65
|
+
verbose: true
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
# Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
# uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
# poetry.lock
|
|
109
|
+
# poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
# pdm.lock
|
|
116
|
+
# pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
# pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# Redis
|
|
135
|
+
*.rdb
|
|
136
|
+
*.aof
|
|
137
|
+
*.pid
|
|
138
|
+
|
|
139
|
+
# RabbitMQ
|
|
140
|
+
mnesia/
|
|
141
|
+
rabbitmq/
|
|
142
|
+
rabbitmq-data/
|
|
143
|
+
|
|
144
|
+
# ActiveMQ
|
|
145
|
+
activemq-data/
|
|
146
|
+
|
|
147
|
+
# SageMath parsed files
|
|
148
|
+
*.sage.py
|
|
149
|
+
|
|
150
|
+
# Environments
|
|
151
|
+
.env
|
|
152
|
+
.envrc
|
|
153
|
+
.venv
|
|
154
|
+
env/
|
|
155
|
+
venv/
|
|
156
|
+
ENV/
|
|
157
|
+
env.bak/
|
|
158
|
+
venv.bak/
|
|
159
|
+
|
|
160
|
+
# Spyder project settings
|
|
161
|
+
.spyderproject
|
|
162
|
+
.spyproject
|
|
163
|
+
|
|
164
|
+
# Rope project settings
|
|
165
|
+
.ropeproject
|
|
166
|
+
|
|
167
|
+
# mkdocs documentation
|
|
168
|
+
/site
|
|
169
|
+
|
|
170
|
+
# mypy
|
|
171
|
+
.mypy_cache/
|
|
172
|
+
.dmypy.json
|
|
173
|
+
dmypy.json
|
|
174
|
+
|
|
175
|
+
# Pyre type checker
|
|
176
|
+
.pyre/
|
|
177
|
+
|
|
178
|
+
# pytype static type analyzer
|
|
179
|
+
.pytype/
|
|
180
|
+
|
|
181
|
+
# Cython debug symbols
|
|
182
|
+
cython_debug/
|
|
183
|
+
|
|
184
|
+
# PyCharm
|
|
185
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
186
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
188
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
189
|
+
# .idea/
|
|
190
|
+
|
|
191
|
+
# Abstra
|
|
192
|
+
# Abstra is an AI-powered process automation framework.
|
|
193
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
194
|
+
# Learn more at https://abstra.io/docs
|
|
195
|
+
.abstra/
|
|
196
|
+
|
|
197
|
+
# Visual Studio Code
|
|
198
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
199
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
200
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
201
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
202
|
+
# .vscode/
|
|
203
|
+
# Temporary file for partial code execution
|
|
204
|
+
tempCodeRunnerFile.py
|
|
205
|
+
|
|
206
|
+
# Ruff stuff:
|
|
207
|
+
.ruff_cache/
|
|
208
|
+
|
|
209
|
+
# PyPI configuration file
|
|
210
|
+
.pypirc
|
|
211
|
+
|
|
212
|
+
# Marimo
|
|
213
|
+
marimo/_static/
|
|
214
|
+
marimo/_lsp/
|
|
215
|
+
__marimo__/
|
|
216
|
+
|
|
217
|
+
# Streamlit
|
|
218
|
+
.streamlit/secrets.toml
|
|
219
|
+
|
|
220
|
+
# Benchmark outputs
|
|
221
|
+
benchmarks/imdb/results*.json
|
|
222
|
+
benchmarks/imdb/.cache_*/
|
|
223
|
+
|
|
224
|
+
# Claude Code
|
|
225
|
+
.claude/
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# thrifty-ml
|
|
2
|
+
|
|
3
|
+
## gstack
|
|
4
|
+
|
|
5
|
+
Use `/browse` from gstack for all web browsing. Never use `mcp__claude-in-chrome__*` tools.
|
|
6
|
+
|
|
7
|
+
Available gstack skills:
|
|
8
|
+
- `/office-hours` — office hours facilitation
|
|
9
|
+
- `/plan-ceo-review` — CEO review planning
|
|
10
|
+
- `/plan-eng-review` — engineering review planning
|
|
11
|
+
- `/plan-design-review` — design review planning
|
|
12
|
+
- `/design-consultation` — design consultation
|
|
13
|
+
- `/design-shotgun` — rapid design exploration
|
|
14
|
+
- `/design-html` — HTML design generation
|
|
15
|
+
- `/review` — code review
|
|
16
|
+
- `/ship` — ship a feature
|
|
17
|
+
- `/land-and-deploy` — land and deploy changes
|
|
18
|
+
- `/canary` — canary deployment
|
|
19
|
+
- `/benchmark` — benchmarking
|
|
20
|
+
- `/browse` — headless browser for web browsing and QA
|
|
21
|
+
- `/connect-chrome` — connect to Chrome
|
|
22
|
+
- `/qa` — QA testing
|
|
23
|
+
- `/qa-only` — QA without implementation
|
|
24
|
+
- `/design-review` — design review
|
|
25
|
+
- `/setup-browser-cookies` — configure browser cookies
|
|
26
|
+
- `/setup-deploy` — configure deployment
|
|
27
|
+
- `/setup-gbrain` — configure gbrain
|
|
28
|
+
- `/retro` — retrospective
|
|
29
|
+
- `/investigate` — investigation and debugging
|
|
30
|
+
- `/document-release` — release documentation
|
|
31
|
+
- `/document-generate` — documentation generation
|
|
32
|
+
- `/codex` — codex operations
|
|
33
|
+
- `/cso` — CSO operations
|
|
34
|
+
- `/autoplan` — automated planning
|
|
35
|
+
- `/plan-devex-review` — developer experience review planning
|
|
36
|
+
- `/devex-review` — developer experience review
|
|
37
|
+
- `/careful` — careful mode for risky changes
|
|
38
|
+
- `/freeze` — freeze deployments
|
|
39
|
+
- `/guard` — guard mode
|
|
40
|
+
- `/unfreeze` — unfreeze deployments
|
|
41
|
+
- `/gstack-upgrade` — upgrade gstack
|
|
42
|
+
- `/learn` — learning and documentation
|
|
43
|
+
|
|
44
|
+
## Skill routing
|
|
45
|
+
|
|
46
|
+
When the user's request matches an available skill, ALWAYS invoke it using the Skill
|
|
47
|
+
tool as your FIRST action. Do NOT answer directly, do NOT use other tools first.
|
|
48
|
+
The skill has specialized workflows that produce better results than ad-hoc answers.
|
|
49
|
+
|
|
50
|
+
Key routing rules:
|
|
51
|
+
- Product ideas, "is this worth building", brainstorming → invoke office-hours
|
|
52
|
+
- Bugs, errors, "why is this broken", 500 errors → invoke investigate
|
|
53
|
+
- Ship, deploy, push, create PR → invoke ship
|
|
54
|
+
- QA, test the site, find bugs → invoke qa
|
|
55
|
+
- Code review, check my diff → invoke review
|
|
56
|
+
- Update docs after shipping → invoke document-release
|
|
57
|
+
- Weekly retro → invoke retro
|
|
58
|
+
- Design system, brand → invoke design-consultation
|
|
59
|
+
- Visual audit, design polish → invoke design-review
|
|
60
|
+
- Architecture review → invoke plan-eng-review
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
# thrifty-ml Design
|
|
2
|
+
|
|
3
|
+
## What it does
|
|
4
|
+
|
|
5
|
+
thrifty-ml replaces per-row LLM calls with lightweight ML classifiers ("proxy models") trained on text embeddings. Instead of asking an LLM to evaluate every row in a DataFrame, it:
|
|
6
|
+
|
|
7
|
+
1. Labels a small sample (~1 000 rows) using the LLM.
|
|
8
|
+
2. Embeds all rows with an embedding model.
|
|
9
|
+
3. Trains a fast classifier (logistic regression, SVM, or LightGBM) on the labeled sample.
|
|
10
|
+
4. Evaluates classifier quality on a holdout split.
|
|
11
|
+
5. If quality is good enough, uses the classifier to label the remaining rows instead of the LLM.
|
|
12
|
+
|
|
13
|
+
On large datasets this yields 100–1 000× cheaper and faster labeling with accuracy that matches the LLM within a configurable tolerance.
|
|
14
|
+
|
|
15
|
+
The technique is described in [arXiv 2603.15970](https://arxiv.org/html/2603.15970v6) and is used inside Google's BigQuery `AI.IF` and AlloyDB accelerated functions. thrifty-ml ports it to any Python DataFrame.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Architecture
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
thrifty_ml/
|
|
23
|
+
├── __init__.py # Public API: ml_filter, ml_classify, Proxy, EmbeddingBackend
|
|
24
|
+
├── engine.py # Engine — the orchestration core
|
|
25
|
+
├── sampling.py # random_sample(): splits df into sample + remainder
|
|
26
|
+
├── evaluator.py # evaluate(), train_holdout_split()
|
|
27
|
+
├── cache.py # diskcache wrappers for embeddings and labels
|
|
28
|
+
├── llm.py # Async LLM labeling via LiteLLM
|
|
29
|
+
├── embeddings.py # EmbeddingBackend ABC + LiteLLMEmbeddingBackend
|
|
30
|
+
├── cli.py # Typer CLI (filter, classify, embed, label, cache clear)
|
|
31
|
+
└── proxy/
|
|
32
|
+
├── base.py # ProxyModel ABC (fit, predict, save, load)
|
|
33
|
+
├── linear.py # LogisticRegressionProxy, LinearSVCProxy
|
|
34
|
+
└── trees.py # LightGBMProxy
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Pipeline
|
|
40
|
+
|
|
41
|
+
Both online and offline modes share the same core pipeline inside `Engine`:
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
df
|
|
45
|
+
│
|
|
46
|
+
├─ embed_texts(all rows) ──────────── cache hit/miss per text ──── diskcache
|
|
47
|
+
│
|
|
48
|
+
├─ random_sample(sample_size)
|
|
49
|
+
│ │
|
|
50
|
+
│ └─ label_texts(sample) ─────── LLM calls (async, batched) ── diskcache
|
|
51
|
+
│
|
|
52
|
+
├─ train_holdout_split(labeled sample)
|
|
53
|
+
│ │ 80 % train / 20 % holdout (stratified when possible)
|
|
54
|
+
│ │
|
|
55
|
+
│ └─ proxy.fit(X_train, y_train)
|
|
56
|
+
│
|
|
57
|
+
├─ evaluate(proxy, X_holdout, y_holdout)
|
|
58
|
+
│ │ proxy_f1 >= 1.0 - τ → use_proxy = True
|
|
59
|
+
│ │ otherwise emit UserWarning and fall back to LLM
|
|
60
|
+
│ │
|
|
61
|
+
│ └─ τ = fallback_threshold (default 0.1)
|
|
62
|
+
│
|
|
63
|
+
└─ predict remainder
|
|
64
|
+
if use_proxy: proxy.predict(X_remainder)
|
|
65
|
+
else: label_texts(remainder) ← full LLM cost
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**Online mode** (`ml_filter` / `ml_classify`) runs the whole pipeline and returns labels for every row.
|
|
69
|
+
|
|
70
|
+
**Offline mode** (`Proxy.fit` / `Proxy.predict`) runs up to and including `proxy.fit`, then serializes the trained model. `predict()` only embeds + classifies — no LLM calls in the hot path.
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Key components
|
|
75
|
+
|
|
76
|
+
### Engine (`engine.py`)
|
|
77
|
+
|
|
78
|
+
Accepts a prompt, LLM model string, embedding backend, and proxy type. Coordinates all pipeline stages. Two entry points:
|
|
79
|
+
|
|
80
|
+
- `run(df, text_column)` — online, returns a label array for the full DataFrame.
|
|
81
|
+
- `fit(df, text_column)` — offline, returns `(proxy_model, eval_result)` for serialization.
|
|
82
|
+
|
|
83
|
+
`_run_async(coro)` is a small helper that makes the async LLM calls work in any context. Plain scripts use `asyncio.run()`. Environments that already have a running event loop (such as Jupyter notebooks) use `nest_asyncio` if it is available, and otherwise fall back to a `ThreadPoolExecutor`; this avoids the `RuntimeError` that `asyncio.run()` raises when it is called from a running event loop.
|
|
84
|
+
|
|
85
|
+
### Sampling (`sampling.py`)
|
|
86
|
+
|
|
87
|
+
`random_sample(df, n, seed)` returns `(sample_df, remainder_df)`. If `n >= len(df)` the entire DataFrame is the sample and remainder is empty (LLM labels are returned directly, no proxy needed).
|
|
88
|
+
|
|
89
|
+
### Evaluator (`evaluator.py`)
|
|
90
|
+
|
|
91
|
+
`train_holdout_split(X, y, holdout_fraction=0.2)` uses `StratifiedShuffleSplit` when all classes have ≥ 2 samples; falls back to a random permutation when the sample is too small to stratify.
|
|
92
|
+
|
|
93
|
+
`evaluate(proxy, X_train, y_train, X_holdout, y_holdout, fallback_threshold)`:
|
|
94
|
+
|
|
95
|
+
- Fits the proxy on the train split.
|
|
96
|
+
- Computes `proxy_f1` using `f1_score` with `average="binary"` for two-class problems and `"macro"` for three or more. The `average` is determined from the **union** of train and holdout labels, so a class that only appears in holdout does not cause a crash.
|
|
97
|
+
- Compares `proxy_f1 >= 1.0 - fallback_threshold`. Because LLM labels are used as ground truth, the LLM's own F1 is trivially 1.0.
|
|
98
|
+
- Emits a `UserWarning` and sets `use_proxy=False` on failure.
|
|
99
|
+
|
|
100
|
+
### LLM labeling (`llm.py`)
|
|
101
|
+
|
|
102
|
+
`label_texts(texts, prompt, model, classes, max_concurrency, cache_dir)` is an async function. It fires one coroutine per text, throttled by an `asyncio.Semaphore`. Each call:
|
|
103
|
+
|
|
104
|
+
1. Checks the label cache.
|
|
105
|
+
2. On a miss: calls `litellm.acompletion` with `response_format={"type": "json_object"}` and `temperature=0`.
|
|
106
|
+
3. For binary mode (no `classes`): parses `{"label": true/false}`.
|
|
107
|
+
4. For multiclass: parses `{"label": "<class>"}` and returns `"__unknown__"` if the value is not in the allowed list.
|
|
108
|
+
5. Writes the result to the label cache.
|
|
109
|
+
|
|
110
|
+
Retries on transient errors are handled by `_litellm_call_with_retry` in `_utils.py`.
|
|
111
|
+
|
|
112
|
+
### Embeddings (`embeddings.py`)
|
|
113
|
+
|
|
114
|
+
`EmbeddingBackend` is an ABC with two requirements:
|
|
115
|
+
|
|
116
|
+
- `model_id: str` — stable string used as the diskcache key.
|
|
117
|
+
- `embed(texts: list[str]) -> np.ndarray` — returns a float32 array of shape `(n, dim)`.
|
|
118
|
+
|
|
119
|
+
`LiteLLMEmbeddingBackend(model: str)` is the default implementation. It sends texts to any LiteLLM-supported embedding provider in chunks of 2 048.
|
|
120
|
+
|
|
121
|
+
`embed_texts(texts, backend, cache_dir)` does cache-then-fill: checks the cache for each text, collects misses, calls `backend.embed()` once for all misses, and writes results back. A warning is emitted for inputs exceeding 250 000 texts (memory risk).
|
|
122
|
+
|
|
123
|
+
Custom backends — sentence-transformers, pre-computed vectors, proprietary APIs — implement `EmbeddingBackend` and pass an instance anywhere a model string is accepted.
|
|
124
|
+
|
|
125
|
+
### Proxy models (`proxy/`)
|
|
126
|
+
|
|
127
|
+
`ProxyModel` ABC (`base.py`) defines `fit(X, y)`, `predict(X)`, optional `predict_proba(X)`, and default `save`/`load` via `joblib`.
|
|
128
|
+
|
|
129
|
+
| Class | Backend | Imbalance handling | `save`/`load` |
|
|
130
|
+
|---|---|---|---|
|
|
131
|
+
| `LogisticRegressionProxy` | sklearn `LogisticRegression` | `class_weight="balanced"` | joblib |
|
|
132
|
+
| `LinearSVCProxy` | sklearn `LinearSVC` | `class_weight="balanced"` | joblib |
|
|
133
|
+
| `LightGBMProxy` | LightGBM `LGBMClassifier` | `is_unbalance=True` | LightGBM booster text format |
|
|
134
|
+
|
|
135
|
+
`LightGBMProxy` overrides `save`/`load` to use the LightGBM native booster format (not joblib), since joblib-serialized LightGBM objects are not portable across LightGBM versions.
|
|
136
|
+
|
|
137
|
+
### Caching (`cache.py`)
|
|
138
|
+
|
|
139
|
+
All embeddings and labels are cached in a `diskcache.FanoutCache` (8 SQLite shards, 60 s timeout) at `~/.cache/thrifty_ml/` by default.
|
|
140
|
+
|
|
141
|
+
Cache keys are namespaced by version: `thrifty_ml/{VERSION}/...`
|
|
142
|
+
|
|
143
|
+
- **Embedding key**: `emb / sha256(text) / model_id`
|
|
144
|
+
- **Label key**: `lbl / sha256(text) / sha256(prompt) / model_id / classes_key`
|
|
145
|
+
|
|
146
|
+
`classes_key` is `"binary"` when no classes are provided, or `sha256(sorted(classes))` for multiclass. This ensures that cached binary labels are not reused for a multiclass query on the same text+prompt, and vice versa.
|
|
147
|
+
|
|
148
|
+
### `Proxy` save/load (`__init__.py`)
|
|
149
|
+
|
|
150
|
+
`Proxy.save(path)` writes two files:
|
|
151
|
+
|
|
152
|
+
- `path` — the serialized proxy model (joblib or LightGBM native).
|
|
153
|
+
- `path.meta.json` — a sidecar: `{"proxy_type": "lr"|"svc"|"lgbm", "embedding_model": "<model_id>"}`.
|
|
154
|
+
|
|
155
|
+
`Proxy.load(path, embedding_model=None)` reads the sidecar to determine the proxy type and embedding model, then dispatches to the correct backend's `load()`. If `embedding_model` is passed explicitly it overrides the sidecar value (required for custom `EmbeddingBackend` subclasses, since only a string model ID is stored in the sidecar).
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## Public API
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from thrifty_ml import ml_filter, ml_classify, Proxy, EmbeddingBackend
|
|
163
|
+
|
|
164
|
+
# Online binary filter
|
|
165
|
+
mask = ml_filter(
|
|
166
|
+
df,
|
|
167
|
+
prompt="Is this review positive?",
|
|
168
|
+
text_column="review",
|
|
169
|
+
llm="anthropic/claude-haiku-4-5",
|
|
170
|
+
embedding_model="text-embedding-3-small",
|
|
171
|
+
proxy="lr", # "lr" | "svc" | "lgbm"
|
|
172
|
+
sample_size=1000,
|
|
173
|
+
fallback_threshold=0.1, # τ: proxy F1 must be >= 1.0 - τ
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
# Online multiclass
|
|
177
|
+
labels = ml_classify(
|
|
178
|
+
df,
|
|
179
|
+
prompt="Classify support ticket intent",
|
|
180
|
+
text_column="body",
|
|
181
|
+
llm="anthropic/claude-haiku-4-5",
|
|
182
|
+
embedding_model="text-embedding-3-small",
|
|
183
|
+
classes=["billing", "tech", "other"],
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
# Offline sklearn-style
|
|
187
|
+
proxy = Proxy(prompt="...", llm="...", embedding_model="...", model="lgbm")
|
|
188
|
+
proxy.fit(train_df, "text")
|
|
189
|
+
proxy.save("proxy.lgbm")
|
|
190
|
+
|
|
191
|
+
loaded = Proxy.load("proxy.lgbm")
|
|
192
|
+
preds = loaded.predict(new_df, "text") # no LLM calls
|
|
193
|
+
|
|
194
|
+
# Custom embedding backend
|
|
195
|
+
class MyBackend(EmbeddingBackend):
|
|
196
|
+
model_id = "my-model-v1"
|
|
197
|
+
def embed(self, texts):
|
|
198
|
+
... # return np.ndarray of shape (len(texts), dim)
|
|
199
|
+
|
|
200
|
+
mask = ml_filter(df, ..., embedding_model=MyBackend())
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## CLI
|
|
206
|
+
|
|
207
|
+
```
|
|
208
|
+
thrifty-ml filter input.parquet --prompt "..." --text-col review --out mask.parquet
|
|
209
|
+
thrifty-ml classify input.parquet --prompt "..." --text-col review --classes a,b,c --out labels.parquet
|
|
210
|
+
thrifty-ml embed input.parquet --text-col review --model text-embedding-3-small --out embeds.npy
|
|
211
|
+
thrifty-ml label input.parquet --prompt "..." --text-col review --sample 1000 --out labels.parquet
|
|
212
|
+
thrifty-ml cache clear
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
All commands accept `--llm`, `--embedding-model`, `--proxy`, `--cache-dir`, `--sample-size`, `--fallback-threshold`, `--max-concurrency`, `--seed`. Input formats: `.parquet`, `.csv`, `.json`, `.jsonl`.
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Design decisions
|
|
220
|
+
|
|
221
|
+
**Why logistic regression as the default proxy?** Embedding models are trained to produce linearly separable representations. The paper's own ablation finds that LR almost always matches or beats more complex classifiers when using modern embeddings. LR trains in seconds on 1 000 samples and infers in < 1 ms per batch.
|
|
222
|
+
|
|
223
|
+
**Why LightGBM for non-linear tasks?** Histogram-based splits are fast on dense float32 arrays, training is low-memory, and the install is lightweight. It handles class imbalance natively via `is_unbalance=True`.
|
|
224
|
+
|
|
225
|
+
**Why a separate sidecar file for `save`/`load`?** LightGBM and sklearn use different serialization formats (LightGBM native vs joblib). Storing proxy type and embedding model in a `.meta.json` sidecar lets `Proxy.load()` self-describe and dispatch correctly without requiring callers to remember or pass the original arguments.
|
|
226
|
+
|
|
227
|
+
**Why include `classes` in the label cache key?** A binary filter (`ml_filter`) and a multiclass classifier (`ml_classify`) can share the same prompt and model. Without a `classes` component in the key, a cached binary `True`/`False` label could be silently returned for a multiclass query expecting `"billing"` / `"tech"` / `"other"`.
|
|
228
|
+
|
|
229
|
+
**Why `_run_async` instead of `asyncio.run` everywhere?** `asyncio.run()` raises `RuntimeError: asyncio.run() cannot be called from a running event loop` when it is invoked inside Jupyter notebooks or async frameworks like FastAPI, because an event loop is already running there. `_run_async` detects a running loop and either patches it with `nest_asyncio` or offloads to a thread, making the library usable without any setup from the caller.
|
|
230
|
+
|
|
231
|
+
**Fallback threshold τ.** The paper reports that τ = 0.1 (i.e., proxy F1 ≥ 0.9, given that the LLM's own F1 is 1.0 by construction) covers > 95% of production use cases. The default is 0.1, but it is user-configurable. Setting τ = 0.0 requires the proxy to achieve a perfect F1 on the holdout, so in practice the pipeline will almost always fall back to the LLM.
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Advantages over the SQL approach (BigQuery AI.IF / AlloyDB)
|
|
236
|
+
|
|
237
|
+
The paper's technique is implemented as SQL functions inside Google's data warehouse products — `AI.IF` in BigQuery and accelerated functions in AlloyDB. That surface imposes several constraints that thrifty-ml removes.
|
|
238
|
+
|
|
239
|
+
**No infrastructure dependency.** The SQL approach requires data to be in BigQuery or AlloyDB, a GCP account, and quota. thrifty-ml works on any DataFrame — pandas, a local parquet file, a CSV — with no cloud account required. The technique is available to anyone running Python.
|
|
240
|
+
|
|
241
|
+
**Any LLM and any embedding provider.** BigQuery and AlloyDB are wired to specific Google models (Vertex AI / Gemini). thrifty-ml uses LiteLLM as the adapter layer, so the same code path works with Anthropic, OpenAI, Bedrock, Vertex, a local Ollama instance, or any LiteLLM-supported provider. The embedding backend is similarly pluggable via the `EmbeddingBackend` ABC — you can bring pre-computed vectors, a fine-tuned sentence-transformer, or a proprietary embedding API.
|
|
242
|
+
|
|
243
|
+
**Offline / deploy-once mode.** SQL functions re-run the sample-label-train pipeline on each query invocation. thrifty-ml's `Proxy` class separates `fit` from `predict`: you train once, serialize to disk (`proxy.joblib` + `.meta.json` sidecar), and deploy the classifier independently. Subsequent predictions make zero LLM calls and run at classifier speed (~0.1 ms / 1 000 rows for logistic regression).
|
|
244
|
+
|
|
245
|
+
**Observability and control.** Inside a SQL function you cannot inspect intermediate results. In thrifty-ml, `EvalResult` exposes `proxy_f1`, `llm_f1`, `use_proxy`, and `holdout_size` after every run. You can tune `fallback_threshold` explicitly, inspect the trained sklearn/LightGBM object, check holdout predictions, or step through the pipeline in a notebook. The SQL surface hides all of this.
|
|
246
|
+
|
|
247
|
+
**Integration with the Python ML ecosystem.** Because proxy models are sklearn or LightGBM objects, standard tooling applies directly — feature importances, calibration, cross-validation, SHAP explanations, model registries. None of that is available through a SQL function.
|
|
248
|
+
|
|
249
|
+
**Iterative development workflow.** A data scientist iterating on a prompt or embedding model has a much tighter feedback loop in Python — run in a notebook, inspect the sample labels, check the proxy F1, adjust, re-run — versus having to push data to a warehouse, re-run a SQL query, and parse results from a table each iteration.
|
|
250
|
+
|
|
251
|
+
The cost and latency wins described in the paper (300–1 000× reduction at 10M-row scale) transfer fully to thrifty-ml because the underlying technique is identical. The difference is that thrifty-ml makes those wins available without requiring a Google data warehouse.
|
thrifty_ml-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Derrick Kondo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|