llm-model-diff 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_model_diff-0.1.0/PKG-INFO +113 -0
- llm_model_diff-0.1.0/README.md +80 -0
- llm_model_diff-0.1.0/pyproject.toml +73 -0
- llm_model_diff-0.1.0/setup.cfg +4 -0
- llm_model_diff-0.1.0/src/llm_model_diff.egg-info/PKG-INFO +113 -0
- llm_model_diff-0.1.0/src/llm_model_diff.egg-info/SOURCES.txt +15 -0
- llm_model_diff-0.1.0/src/llm_model_diff.egg-info/dependency_links.txt +1 -0
- llm_model_diff-0.1.0/src/llm_model_diff.egg-info/entry_points.txt +2 -0
- llm_model_diff-0.1.0/src/llm_model_diff.egg-info/requires.txt +12 -0
- llm_model_diff-0.1.0/src/llm_model_diff.egg-info/top_level.txt +1 -0
- llm_model_diff-0.1.0/src/model_diff/__init__.py +7 -0
- llm_model_diff-0.1.0/src/model_diff/cli.py +154 -0
- llm_model_diff-0.1.0/src/model_diff/differ.py +417 -0
- llm_model_diff-0.1.0/src/model_diff/models.py +258 -0
- llm_model_diff-0.1.0/tests/test_cli.py +178 -0
- llm_model_diff-0.1.0/tests/test_differ.py +202 -0
- llm_model_diff-0.1.0/tests/test_models.py +244 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-model-diff
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Compare LLM model outputs side-by-side with rich diff visualization
|
|
5
|
+
Author: model-diff contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: llm,ai,diff,comparison,openai,anthropic,cli
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Environment :: Console
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Classifier: Topic :: Utilities
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: click>=8.0
|
|
23
|
+
Requires-Dist: rich>=13.0
|
|
24
|
+
Requires-Dist: anthropic>=0.20.0
|
|
25
|
+
Requires-Dist: openai>=1.0.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
29
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
30
|
+
Requires-Dist: isort>=5.0; extra == "dev"
|
|
31
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
32
|
+
Requires-Dist: flake8>=6.0; extra == "dev"
|
|
33
|
+
|
|
34
|
+
# model-diff
|
|
35
|
+
|
|
36
|
+
Compare LLM model outputs side-by-side with rich diff visualization.
|
|
37
|
+
|
|
38
|
+
Run the same prompt on multiple models simultaneously and see exactly what each model says differently.
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install llm-model-diff
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Or install from source:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
git clone https://github.com/yourname/model-diff
|
|
50
|
+
cd model-diff
|
|
51
|
+
pip install -e .
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Requirements
|
|
55
|
+
|
|
56
|
+
Set the API keys for the providers you want to use:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
export OPENAI_API_KEY=sk-...
|
|
60
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Missing keys are handled gracefully — models without a key are skipped with a warning.
|
|
64
|
+
|
|
65
|
+
## Usage
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# Default: compare GPT-4o vs Claude Sonnet
|
|
69
|
+
model-diff "What is the best way to handle errors in Python?"
|
|
70
|
+
|
|
71
|
+
# Specify models explicitly
|
|
72
|
+
model-diff "Explain recursion" --models gpt-4o,claude-sonnet-4-6
|
|
73
|
+
|
|
74
|
+
# Use a prompt file
|
|
75
|
+
model-diff --prompt prompt.txt --models gpt-4o,claude-haiku-4-5-20251001,claude-sonnet-4-6
|
|
76
|
+
|
|
77
|
+
# Word-level diff
|
|
78
|
+
model-diff "Explain recursion" --diff words
|
|
79
|
+
|
|
80
|
+
# Show only differences (hide matching sections)
|
|
81
|
+
model-diff "Explain recursion" --only-diff
|
|
82
|
+
|
|
83
|
+
# Deterministic outputs
|
|
84
|
+
model-diff "Explain recursion" --temperature 0.0
|
|
85
|
+
|
|
86
|
+
# Save results to JSON
|
|
87
|
+
model-diff "Explain recursion" --output results.json
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Supported Models
|
|
91
|
+
|
|
92
|
+
| Model ID | Provider | API Key |
|
|
93
|
+
|---|---|---|
|
|
94
|
+
| `gpt-4o` | OpenAI | `OPENAI_API_KEY` |
|
|
95
|
+
| `gpt-4o-mini` | OpenAI | `OPENAI_API_KEY` |
|
|
96
|
+
| `claude-opus-4-6` | Anthropic | `ANTHROPIC_API_KEY` |
|
|
97
|
+
| `claude-sonnet-4-6` | Anthropic | `ANTHROPIC_API_KEY` |
|
|
98
|
+
| `claude-haiku-4-5-20251001` | Anthropic | `ANTHROPIC_API_KEY` |
|
|
99
|
+
|
|
100
|
+
## Architecture
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
src/model_diff/
|
|
104
|
+
├── cli.py # Click-based CLI entry point
|
|
105
|
+
├── models.py # Provider-specific API callers, run concurrently via threading
|
|
106
|
+
└── differ.py # difflib-based diff engine + Rich output formatter
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Model calls are issued concurrently using `threading`, so wall time equals the slowest model rather than the sum of all models.
|
|
110
|
+
|
|
111
|
+
## License
|
|
112
|
+
|
|
113
|
+
MIT
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# model-diff
|
|
2
|
+
|
|
3
|
+
Compare LLM model outputs side-by-side with rich diff visualization.
|
|
4
|
+
|
|
5
|
+
Run the same prompt on multiple models simultaneously and see exactly what each model says differently.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install llm-model-diff
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Or install from source:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
git clone https://github.com/yourname/model-diff
|
|
17
|
+
cd model-diff
|
|
18
|
+
pip install -e .
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Requirements
|
|
22
|
+
|
|
23
|
+
Set the API keys for the providers you want to use:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
export OPENAI_API_KEY=sk-...
|
|
27
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Missing keys are handled gracefully — models without a key are skipped with a warning.
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
# Default: compare GPT-4o vs Claude Sonnet
|
|
36
|
+
model-diff "What is the best way to handle errors in Python?"
|
|
37
|
+
|
|
38
|
+
# Specify models explicitly
|
|
39
|
+
model-diff "Explain recursion" --models gpt-4o,claude-sonnet-4-6
|
|
40
|
+
|
|
41
|
+
# Use a prompt file
|
|
42
|
+
model-diff --prompt prompt.txt --models gpt-4o,claude-haiku-4-5-20251001,claude-sonnet-4-6
|
|
43
|
+
|
|
44
|
+
# Word-level diff
|
|
45
|
+
model-diff "Explain recursion" --diff words
|
|
46
|
+
|
|
47
|
+
# Show only differences (hide matching sections)
|
|
48
|
+
model-diff "Explain recursion" --only-diff
|
|
49
|
+
|
|
50
|
+
# Deterministic outputs
|
|
51
|
+
model-diff "Explain recursion" --temperature 0.0
|
|
52
|
+
|
|
53
|
+
# Save results to JSON
|
|
54
|
+
model-diff "Explain recursion" --output results.json
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Supported Models
|
|
58
|
+
|
|
59
|
+
| Model ID | Provider | API Key |
|
|
60
|
+
|---|---|---|
|
|
61
|
+
| `gpt-4o` | OpenAI | `OPENAI_API_KEY` |
|
|
62
|
+
| `gpt-4o-mini` | OpenAI | `OPENAI_API_KEY` |
|
|
63
|
+
| `claude-opus-4-6` | Anthropic | `ANTHROPIC_API_KEY` |
|
|
64
|
+
| `claude-sonnet-4-6` | Anthropic | `ANTHROPIC_API_KEY` |
|
|
65
|
+
| `claude-haiku-4-5-20251001` | Anthropic | `ANTHROPIC_API_KEY` |
|
|
66
|
+
|
|
67
|
+
## Architecture
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
src/model_diff/
|
|
71
|
+
├── cli.py # Click-based CLI entry point
|
|
72
|
+
├── models.py # Provider-specific API callers, run concurrently via threading
|
|
73
|
+
└── differ.py # difflib-based diff engine + Rich output formatter
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Model calls are issued concurrently using `threading`, so wall time equals the slowest model rather than the sum of all models.
|
|
77
|
+
|
|
78
|
+
## License
|
|
79
|
+
|
|
80
|
+
MIT
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "llm-model-diff"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Compare LLM model outputs side-by-side with rich diff visualization"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
authors = [{ name = "model-diff contributors" }]
|
|
12
|
+
requires-python = ">=3.8"
|
|
13
|
+
keywords = ["llm", "ai", "diff", "comparison", "openai", "anthropic", "cli"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Environment :: Console",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.8",
|
|
21
|
+
"Programming Language :: Python :: 3.9",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
26
|
+
"Topic :: Utilities",
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
"click>=8.0",
|
|
30
|
+
"rich>=13.0",
|
|
31
|
+
"anthropic>=0.20.0",
|
|
32
|
+
"openai>=1.0.0",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.optional-dependencies]
|
|
36
|
+
dev = [
|
|
37
|
+
"pytest>=7.0",
|
|
38
|
+
"pytest-cov>=4.0",
|
|
39
|
+
"black>=23.0",
|
|
40
|
+
"isort>=5.0",
|
|
41
|
+
"mypy>=1.0",
|
|
42
|
+
"flake8>=6.0",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[project.scripts]
|
|
46
|
+
llm-model-diff = "model_diff.cli:main"
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.packages.find]
|
|
49
|
+
where = ["src"]
|
|
50
|
+
|
|
51
|
+
[tool.setuptools.package-dir]
|
|
52
|
+
"" = "src"
|
|
53
|
+
|
|
54
|
+
[tool.black]
|
|
55
|
+
line-length = 88
|
|
56
|
+
target-version = ["py38"]
|
|
57
|
+
|
|
58
|
+
[tool.isort]
|
|
59
|
+
profile = "black"
|
|
60
|
+
line_length = 88
|
|
61
|
+
|
|
62
|
+
[tool.mypy]
|
|
63
|
+
python_version = "3.8"
|
|
64
|
+
warn_return_any = true
|
|
65
|
+
warn_unused_configs = true
|
|
66
|
+
disallow_untyped_defs = true
|
|
67
|
+
ignore_missing_imports = true
|
|
68
|
+
|
|
69
|
+
[tool.pytest.ini_options]
|
|
70
|
+
testpaths = ["tests"]
|
|
71
|
+
python_files = ["test_*.py", "*_test.py"]
|
|
72
|
+
python_classes = ["Test*"]
|
|
73
|
+
python_functions = ["test_*"]
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-model-diff
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Compare LLM model outputs side-by-side with rich diff visualization
|
|
5
|
+
Author: model-diff contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: llm,ai,diff,comparison,openai,anthropic,cli
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Environment :: Console
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Classifier: Topic :: Utilities
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: click>=8.0
|
|
23
|
+
Requires-Dist: rich>=13.0
|
|
24
|
+
Requires-Dist: anthropic>=0.20.0
|
|
25
|
+
Requires-Dist: openai>=1.0.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
29
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
30
|
+
Requires-Dist: isort>=5.0; extra == "dev"
|
|
31
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
32
|
+
Requires-Dist: flake8>=6.0; extra == "dev"
|
|
33
|
+
|
|
34
|
+
# model-diff
|
|
35
|
+
|
|
36
|
+
Compare LLM model outputs side-by-side with rich diff visualization.
|
|
37
|
+
|
|
38
|
+
Run the same prompt on multiple models simultaneously and see exactly what each model says differently.
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install llm-model-diff
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Or install from source:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
git clone https://github.com/yourname/model-diff
|
|
50
|
+
cd model-diff
|
|
51
|
+
pip install -e .
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Requirements
|
|
55
|
+
|
|
56
|
+
Set the API keys for the providers you want to use:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
export OPENAI_API_KEY=sk-...
|
|
60
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Missing keys are handled gracefully — models without a key are skipped with a warning.
|
|
64
|
+
|
|
65
|
+
## Usage
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# Default: compare GPT-4o vs Claude Sonnet
|
|
69
|
+
model-diff "What is the best way to handle errors in Python?"
|
|
70
|
+
|
|
71
|
+
# Specify models explicitly
|
|
72
|
+
model-diff "Explain recursion" --models gpt-4o,claude-sonnet-4-6
|
|
73
|
+
|
|
74
|
+
# Use a prompt file
|
|
75
|
+
model-diff --prompt prompt.txt --models gpt-4o,claude-haiku-4-5-20251001,claude-sonnet-4-6
|
|
76
|
+
|
|
77
|
+
# Word-level diff
|
|
78
|
+
model-diff "Explain recursion" --diff words
|
|
79
|
+
|
|
80
|
+
# Show only differences (hide matching sections)
|
|
81
|
+
model-diff "Explain recursion" --only-diff
|
|
82
|
+
|
|
83
|
+
# Deterministic outputs
|
|
84
|
+
model-diff "Explain recursion" --temperature 0.0
|
|
85
|
+
|
|
86
|
+
# Save results to JSON
|
|
87
|
+
model-diff "Explain recursion" --output results.json
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Supported Models
|
|
91
|
+
|
|
92
|
+
| Model ID | Provider | API Key |
|
|
93
|
+
|---|---|---|
|
|
94
|
+
| `gpt-4o` | OpenAI | `OPENAI_API_KEY` |
|
|
95
|
+
| `gpt-4o-mini` | OpenAI | `OPENAI_API_KEY` |
|
|
96
|
+
| `claude-opus-4-6` | Anthropic | `ANTHROPIC_API_KEY` |
|
|
97
|
+
| `claude-sonnet-4-6` | Anthropic | `ANTHROPIC_API_KEY` |
|
|
98
|
+
| `claude-haiku-4-5-20251001` | Anthropic | `ANTHROPIC_API_KEY` |
|
|
99
|
+
|
|
100
|
+
## Architecture
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
src/model_diff/
|
|
104
|
+
├── cli.py # Click-based CLI entry point
|
|
105
|
+
├── models.py # Provider-specific API callers, run concurrently via threading
|
|
106
|
+
└── differ.py # difflib-based diff engine + Rich output formatter
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Model calls are issued concurrently using `threading`, so wall time equals the slowest model rather than the sum of all models.
|
|
110
|
+
|
|
111
|
+
## License
|
|
112
|
+
|
|
113
|
+
MIT
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/llm_model_diff.egg-info/PKG-INFO
|
|
4
|
+
src/llm_model_diff.egg-info/SOURCES.txt
|
|
5
|
+
src/llm_model_diff.egg-info/dependency_links.txt
|
|
6
|
+
src/llm_model_diff.egg-info/entry_points.txt
|
|
7
|
+
src/llm_model_diff.egg-info/requires.txt
|
|
8
|
+
src/llm_model_diff.egg-info/top_level.txt
|
|
9
|
+
src/model_diff/__init__.py
|
|
10
|
+
src/model_diff/cli.py
|
|
11
|
+
src/model_diff/differ.py
|
|
12
|
+
src/model_diff/models.py
|
|
13
|
+
tests/test_cli.py
|
|
14
|
+
tests/test_differ.py
|
|
15
|
+
tests/test_models.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
model_diff
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""model-diff: Compare LLM model outputs side-by-side with rich diff visualization."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
__all__ = ["ModelRunner", "DiffEngine", "ModelResult"]
|
|
5
|
+
|
|
6
|
+
from model_diff.models import ModelResult, ModelRunner
|
|
7
|
+
from model_diff.differ import DiffEngine
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""CLI entry point for model-diff."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
|
|
12
|
+
from model_diff.models import DEFAULT_MODELS, ALL_SUPPORTED_MODELS, ModelRunner
|
|
13
|
+
from model_diff.differ import DiffEngine
|
|
14
|
+
|
|
15
|
+
console = Console()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# Helpers
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
def _parse_models(models_str: str) -> List[str]:
|
|
23
|
+
"""Split a comma-separated model list and strip whitespace."""
|
|
24
|
+
return [m.strip() for m in models_str.split(",") if m.strip()]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _load_prompt(prompt_text: Optional[str], prompt_file: Optional[str]) -> str:
|
|
28
|
+
"""Return the prompt string from either the positional arg or a file."""
|
|
29
|
+
if prompt_file:
|
|
30
|
+
p = Path(prompt_file)
|
|
31
|
+
if not p.exists():
|
|
32
|
+
console.print(f"[red]Error:[/red] prompt file '{prompt_file}' not found.")
|
|
33
|
+
sys.exit(1)
|
|
34
|
+
return p.read_text(encoding="utf-8").strip()
|
|
35
|
+
if prompt_text:
|
|
36
|
+
return prompt_text.strip()
|
|
37
|
+
console.print(
|
|
38
|
+
"[red]Error:[/red] You must supply a prompt as an argument or via --prompt."
|
|
39
|
+
)
|
|
40
|
+
console.print("Usage: model-diff \"Your prompt here\"")
|
|
41
|
+
console.print(" model-diff --prompt prompt.txt")
|
|
42
|
+
sys.exit(1)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
# Click command
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
@click.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.argument("prompt_text", required=False, metavar="PROMPT")
@click.option(
    "--prompt",
    "prompt_file",
    default=None,
    metavar="FILE",
    help="Path to a text file containing the prompt (alternative to inline PROMPT).",
)
@click.option(
    "--models",
    "models_str",
    default=",".join(DEFAULT_MODELS),
    show_default=True,
    metavar="MODEL1,MODEL2,...",
    help=(
        "Comma-separated list of model IDs to compare. "
        f"Supported: {', '.join(sorted(ALL_SUPPORTED_MODELS))}"
    ),
)
@click.option(
    "--diff",
    "diff_mode",
    default="lines",
    type=click.Choice(["lines", "words", "chars"], case_sensitive=False),
    show_default=True,
    help="Granularity of the diff: lines, words, or chars.",
)
@click.option(
    "--only-diff",
    is_flag=True,
    default=False,
    help="Hide matching sections and show only the differing parts.",
)
@click.option(
    "--output",
    "output_file",
    default=None,
    metavar="FILE",
    help="Save the full results as JSON to this file.",
)
@click.option(
    "--temperature",
    default=0.7,
    show_default=True,
    type=click.FloatRange(0.0, 2.0),
    help="Sampling temperature passed to each model (0.0 = deterministic).",
)
# The version is read from installed distribution metadata, so this must be
# the distribution name declared in pyproject.toml ("llm-model-diff"), not the
# import package or the README title; a wrong name makes --version fail.
@click.version_option(package_name="llm-model-diff")
def main(
    prompt_text: Optional[str],
    prompt_file: Optional[str],
    models_str: str,
    diff_mode: str,
    only_diff: bool,
    output_file: Optional[str],
    temperature: float,
) -> None:
    """
    Run the same prompt on multiple LLM models and show a side-by-side diff.

    \b
    Examples:
      model-diff "What is the best way to handle errors in Python?"
      model-diff "Explain recursion" --models gpt-4o,claude-sonnet-4-6
      model-diff --prompt prompt.txt --models gpt-4o-mini,claude-haiku-4-5-20251001
      model-diff "Explain recursion" --diff words --only-diff
      model-diff "Explain recursion" --output results.json --temperature 0.0
    """
    # NOTE: the docstring above is rendered by Click as the --help text, so it
    # is kept user-facing; implementation notes live in comments instead.
    #
    # Exit codes: 1 = no usable prompt or no valid model, 2 = every model
    # returned an error, 0 otherwise.

    # Exits with status 1 itself when no prompt can be obtained.
    prompt = _load_prompt(prompt_text, prompt_file)

    requested_models = _parse_models(models_str)

    # Validate model names
    valid_models, warnings = ModelRunner.validate_models(requested_models)
    for w in warnings:
        console.print(f"[yellow]Warning:[/yellow] {w}")

    if not valid_models:
        console.print(
            "[red]Error:[/red] No valid models to run. "
            f"Supported: {', '.join(sorted(ALL_SUPPORTED_MODELS))}"
        )
        sys.exit(1)

    # ── Run the models ────────────────────────────────────────────────
    console.print(
        f"\n[dim]Running prompt on {len(valid_models)} model(s): "
        f"{', '.join(valid_models)}  (temperature={temperature})[/dim]\n"
    )

    runner = ModelRunner(models=valid_models, temperature=temperature)
    results = runner.run(prompt)

    # ── Render ────────────────────────────────────────────────────────
    engine = DiffEngine(diff_mode=diff_mode, only_diff=only_diff, console=console)
    engine.render(
        prompt=prompt,
        results=results,
        temperature=temperature,
        output_file=output_file,
    )

    # Exit with non-zero if every model failed
    if all(r.error for r in results):
        sys.exit(2)
|