lefx 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lefx-0.3.0/.env.example +8 -0
- lefx-0.3.0/.github/dependabot.yml +11 -0
- lefx-0.3.0/.github/workflows/ci.yml +136 -0
- lefx-0.3.0/.github/workflows/release.yml +179 -0
- lefx-0.3.0/.gitignore +31 -0
- lefx-0.3.0/CHANGELOG.md +75 -0
- lefx-0.3.0/CLAUDE.md +101 -0
- lefx-0.3.0/LICENSE +21 -0
- lefx-0.3.0/MANIFEST.in +5 -0
- lefx-0.3.0/PKG-INFO +1336 -0
- lefx-0.3.0/README.md +1289 -0
- lefx-0.3.0/docs/walkthrough.md +749 -0
- lefx-0.3.0/examples/custom_scorer.py +140 -0
- lefx-0.3.0/examples/dataset_eval.py +100 -0
- lefx-0.3.0/examples/eval_suite.yaml +43 -0
- lefx-0.3.0/examples/langgraph_eval.py +121 -0
- lefx-0.3.0/examples/llm_as_judge.py +69 -0
- lefx-0.3.0/examples/online_eval.py +71 -0
- lefx-0.3.0/examples/quickstart.py +55 -0
- lefx-0.3.0/examples/test_data.yaml +17 -0
- lefx-0.3.0/pyproject.toml +96 -0
- lefx-0.3.0/src/lef/__init__.py +256 -0
- lefx-0.3.0/src/lef/assertions.py +148 -0
- lefx-0.3.0/src/lef/cache.py +146 -0
- lefx-0.3.0/src/lef/ci/__init__.py +13 -0
- lefx-0.3.0/src/lef/ci/azuredevops.py +211 -0
- lefx-0.3.0/src/lef/ci/github.py +183 -0
- lefx-0.3.0/src/lef/cli.py +1086 -0
- lefx-0.3.0/src/lef/compare.py +374 -0
- lefx-0.3.0/src/lef/config.py +94 -0
- lefx-0.3.0/src/lef/core/__init__.py +23 -0
- lefx-0.3.0/src/lef/core/base.py +115 -0
- lefx-0.3.0/src/lef/core/decorators.py +147 -0
- lefx-0.3.0/src/lef/core/types.py +113 -0
- lefx-0.3.0/src/lef/datasets/__init__.py +19 -0
- lefx-0.3.0/src/lef/datasets/loader.py +135 -0
- lefx-0.3.0/src/lef/datasets/runner.py +360 -0
- lefx-0.3.0/src/lef/git_context.py +185 -0
- lefx-0.3.0/src/lef/integrations/__init__.py +31 -0
- lefx-0.3.0/src/lef/integrations/langchain.py +162 -0
- lefx-0.3.0/src/lef/integrations/langgraph.py +143 -0
- lefx-0.3.0/src/lef/integrations/remote.py +206 -0
- lefx-0.3.0/src/lef/judges/__init__.py +43 -0
- lefx-0.3.0/src/lef/judges/llm.py +179 -0
- lefx-0.3.0/src/lef/judges/prompts.py +95 -0
- lefx-0.3.0/src/lef/judges/trajectory.py +87 -0
- lefx-0.3.0/src/lef/monitor.py +223 -0
- lefx-0.3.0/src/lef/online/__init__.py +15 -0
- lefx-0.3.0/src/lef/online/tracing.py +251 -0
- lefx-0.3.0/src/lef/output.py +285 -0
- lefx-0.3.0/src/lef/py.typed +0 -0
- lefx-0.3.0/src/lef/pytest_plugin.py +204 -0
- lefx-0.3.0/src/lef/redteam.py +292 -0
- lefx-0.3.0/src/lef/scorers/__init__.py +22 -0
- lefx-0.3.0/src/lef/scorers/builtin.py +241 -0
- lefx-0.3.0/src/lef/scorers/custom.py +110 -0
- lefx-0.3.0/src/lef/synthetic.py +373 -0
- lefx-0.3.0/src/lef/watch.py +121 -0
- lefx-0.3.0/tests/__init__.py +0 -0
- lefx-0.3.0/tests/conftest.py +28 -0
- lefx-0.3.0/tests/test_assertions.py +137 -0
- lefx-0.3.0/tests/test_cli.py +118 -0
- lefx-0.3.0/tests/test_cli_extended.py +185 -0
- lefx-0.3.0/tests/test_config.py +36 -0
- lefx-0.3.0/tests/test_config_integration.py +149 -0
- lefx-0.3.0/tests/test_core.py +164 -0
- lefx-0.3.0/tests/test_integrations.py +142 -0
- lefx-0.3.0/tests/test_judges.py +145 -0
- lefx-0.3.0/tests/test_loader.py +153 -0
- lefx-0.3.0/tests/test_new_modules.py +954 -0
- lefx-0.3.0/tests/test_new_modules_2.py +597 -0
- lefx-0.3.0/tests/test_online.py +220 -0
- lefx-0.3.0/tests/test_online_extended.py +189 -0
- lefx-0.3.0/tests/test_remote.py +124 -0
- lefx-0.3.0/tests/test_runner.py +316 -0
- lefx-0.3.0/tests/test_scorers.py +186 -0
- lefx-0.3.0/tests/test_summary_evaluators.py +171 -0
- lefx-0.3.0/tests/test_trajectory.py +32 -0
lefx-0.3.0/.env.example
ADDED
lefx-0.3.0/.github/workflows/ci.yml
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
# Cancel in-progress runs for the same branch/PR
|
|
10
|
+
concurrency:
|
|
11
|
+
group: ci-${{ github.ref }}
|
|
12
|
+
cancel-in-progress: true
|
|
13
|
+
|
|
14
|
+
# Restrict default permissions (public repo hardening)
|
|
15
|
+
permissions:
|
|
16
|
+
contents: read
|
|
17
|
+
|
|
18
|
+
jobs:
|
|
19
|
+
lint:
|
|
20
|
+
name: Lint
|
|
21
|
+
runs-on: [self-hosted, Linux]
|
|
22
|
+
steps:
|
|
23
|
+
- uses: actions/checkout@v4
|
|
24
|
+
- uses: astral-sh/setup-uv@v4
|
|
25
|
+
with:
|
|
26
|
+
version: "latest"
|
|
27
|
+
- name: Install dependencies
|
|
28
|
+
run: uv sync --extra dev
|
|
29
|
+
- name: Ruff check
|
|
30
|
+
run: uv run ruff check src/ tests/
|
|
31
|
+
- name: Ruff format check
|
|
32
|
+
run: uv run ruff format --check src/ tests/
|
|
33
|
+
|
|
34
|
+
test-linux:
|
|
35
|
+
name: Test (Linux, Python ${{ matrix.python-version }})
|
|
36
|
+
runs-on: [self-hosted, Linux]
|
|
37
|
+
strategy:
|
|
38
|
+
fail-fast: false
|
|
39
|
+
matrix:
|
|
40
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
41
|
+
steps:
|
|
42
|
+
- uses: actions/checkout@v4
|
|
43
|
+
- uses: astral-sh/setup-uv@v4
|
|
44
|
+
with:
|
|
45
|
+
version: "latest"
|
|
46
|
+
- name: Set Python version
|
|
47
|
+
run: uv python install ${{ matrix.python-version }}
|
|
48
|
+
- name: Install dependencies
|
|
49
|
+
run: uv sync --python ${{ matrix.python-version }} --extra dev --extra all
|
|
50
|
+
- name: Run tests
|
|
51
|
+
run: uv run pytest -v --tb=short
|
|
52
|
+
- name: Verify CLI entry point
|
|
53
|
+
run: uv run lef --help
|
|
54
|
+
- name: Verify package import
|
|
55
|
+
run: uv run python -c "import lef; print(f'LEF {lef.__version__} — {len(lef.__all__)} exports')"
|
|
56
|
+
|
|
57
|
+
test-windows:
|
|
58
|
+
name: Test (Windows, Python ${{ matrix.python-version }})
|
|
59
|
+
runs-on: [self-hosted, Windows]
|
|
60
|
+
strategy:
|
|
61
|
+
fail-fast: false
|
|
62
|
+
matrix:
|
|
63
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
64
|
+
steps:
|
|
65
|
+
- uses: actions/checkout@v4
|
|
66
|
+
- uses: astral-sh/setup-uv@v4
|
|
67
|
+
with:
|
|
68
|
+
version: "latest"
|
|
69
|
+
- name: Set Python version
|
|
70
|
+
run: uv python install ${{ matrix.python-version }}
|
|
71
|
+
- name: Install dependencies
|
|
72
|
+
run: uv sync --python ${{ matrix.python-version }} --extra dev --extra all
|
|
73
|
+
- name: Run tests
|
|
74
|
+
run: uv run pytest -v --tb=short
|
|
75
|
+
- name: Verify CLI entry point
|
|
76
|
+
run: uv run lef --help
|
|
77
|
+
- name: Verify package import
|
|
78
|
+
run: uv run python -c "import lef; print(f'LEF {lef.__version__} — {len(lef.__all__)} exports')"
|
|
79
|
+
|
|
80
|
+
# test-macos:
|
|
81
|
+
# name: Test (macOS, Python ${{ matrix.python-version }})
|
|
82
|
+
# runs-on: [self-hosted, macOS]
|
|
83
|
+
# strategy:
|
|
84
|
+
# fail-fast: false
|
|
85
|
+
# matrix:
|
|
86
|
+
# python-version: ["3.11", "3.12", "3.13"]
|
|
87
|
+
# steps:
|
|
88
|
+
# - uses: actions/checkout@v4
|
|
89
|
+
# - uses: astral-sh/setup-uv@v4
|
|
90
|
+
# with:
|
|
91
|
+
# version: "latest"
|
|
92
|
+
# - name: Set Python version
|
|
93
|
+
# run: uv python install ${{ matrix.python-version }}
|
|
94
|
+
# - name: Install dependencies
|
|
95
|
+
# run: uv sync --python ${{ matrix.python-version }} --extra dev --extra all
|
|
96
|
+
# - name: Run tests
|
|
97
|
+
# run: uv run pytest -v --tb=short
|
|
98
|
+
# - name: Verify CLI entry point
|
|
99
|
+
# run: uv run lef --help
|
|
100
|
+
# - name: Verify package import
|
|
101
|
+
# run: uv run python -c "import lef; print(f'LEF {lef.__version__} — {len(lef.__all__)} exports')"
|
|
102
|
+
|
|
103
|
+
typecheck:
|
|
104
|
+
name: Type Check
|
|
105
|
+
runs-on: [self-hosted, Linux]
|
|
106
|
+
steps:
|
|
107
|
+
- uses: actions/checkout@v4
|
|
108
|
+
- uses: astral-sh/setup-uv@v4
|
|
109
|
+
with:
|
|
110
|
+
version: "latest"
|
|
111
|
+
- name: Install dependencies
|
|
112
|
+
run: uv sync --extra dev --extra all
|
|
113
|
+
- name: MyPy
|
|
114
|
+
run: uv run mypy src/lef/ --ignore-missing-imports
|
|
115
|
+
|
|
116
|
+
build:
|
|
117
|
+
name: Build Package
|
|
118
|
+
runs-on: [self-hosted, Linux]
|
|
119
|
+
needs: [lint, test-linux]
|
|
120
|
+
steps:
|
|
121
|
+
- uses: actions/checkout@v4
|
|
122
|
+
- uses: astral-sh/setup-uv@v4
|
|
123
|
+
with:
|
|
124
|
+
version: "latest"
|
|
125
|
+
- name: Build wheel and sdist
|
|
126
|
+
run: uv build
|
|
127
|
+
- name: Verify wheel installs
|
|
128
|
+
run: |
|
|
129
|
+
uv venv --clear /tmp/lef-install-test
|
|
130
|
+
uv pip install --python /tmp/lef-install-test/bin/python dist/*.whl
|
|
131
|
+
/tmp/lef-install-test/bin/python -c "import lef; print(f'LEF {lef.__version__}')"
|
|
132
|
+
- uses: actions/upload-artifact@v4
|
|
133
|
+
with:
|
|
134
|
+
name: dist
|
|
135
|
+
path: dist/
|
|
136
|
+
retention-days: 30
|
lefx-0.3.0/.github/workflows/release.yml
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
# Restrict default permissions (public repo hardening)
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
validate:
|
|
14
|
+
name: Validate Tag
|
|
15
|
+
runs-on: [self-hosted, Linux]
|
|
16
|
+
outputs:
|
|
17
|
+
version: ${{ steps.version.outputs.version }}
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
- name: Extract version from tag
|
|
21
|
+
id: version
|
|
22
|
+
run: |
|
|
23
|
+
TAG="${GITHUB_REF#refs/tags/v}"
|
|
24
|
+
echo "version=$TAG" >> "$GITHUB_OUTPUT"
|
|
25
|
+
echo "Release version: $TAG"
|
|
26
|
+
- uses: astral-sh/setup-uv@v4
|
|
27
|
+
with:
|
|
28
|
+
version: "latest"
|
|
29
|
+
- name: Verify tag matches pyproject.toml version
|
|
30
|
+
run: |
|
|
31
|
+
PKG_VERSION=$(uv run python -c "import lef; print(lef.__version__)")
|
|
32
|
+
TAG_VERSION="${GITHUB_REF#refs/tags/v}"
|
|
33
|
+
if [ "$PKG_VERSION" != "$TAG_VERSION" ]; then
|
|
34
|
+
echo "ERROR: Tag version ($TAG_VERSION) does not match package version ($PKG_VERSION)"
|
|
35
|
+
exit 1
|
|
36
|
+
fi
|
|
37
|
+
echo "Version match confirmed: $PKG_VERSION"
|
|
38
|
+
|
|
39
|
+
test-linux:
|
|
40
|
+
name: Test (Linux, Python ${{ matrix.python-version }})
|
|
41
|
+
runs-on: [self-hosted, Linux]
|
|
42
|
+
needs: [validate]
|
|
43
|
+
strategy:
|
|
44
|
+
fail-fast: false
|
|
45
|
+
matrix:
|
|
46
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
47
|
+
steps:
|
|
48
|
+
- uses: actions/checkout@v4
|
|
49
|
+
- uses: astral-sh/setup-uv@v4
|
|
50
|
+
with:
|
|
51
|
+
version: "latest"
|
|
52
|
+
- name: Set Python version
|
|
53
|
+
run: uv python install ${{ matrix.python-version }}
|
|
54
|
+
- name: Install dependencies
|
|
55
|
+
run: uv sync --python ${{ matrix.python-version }} --extra dev --extra all
|
|
56
|
+
- name: Run tests
|
|
57
|
+
run: uv run pytest -v --tb=short
|
|
58
|
+
|
|
59
|
+
test-windows:
|
|
60
|
+
name: Test (Windows, Python ${{ matrix.python-version }})
|
|
61
|
+
runs-on: [self-hosted, Windows]
|
|
62
|
+
needs: [validate]
|
|
63
|
+
strategy:
|
|
64
|
+
fail-fast: false
|
|
65
|
+
matrix:
|
|
66
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
67
|
+
steps:
|
|
68
|
+
- uses: actions/checkout@v4
|
|
69
|
+
- uses: astral-sh/setup-uv@v4
|
|
70
|
+
with:
|
|
71
|
+
version: "latest"
|
|
72
|
+
- name: Set Python version
|
|
73
|
+
run: uv python install ${{ matrix.python-version }}
|
|
74
|
+
- name: Install dependencies
|
|
75
|
+
run: uv sync --python ${{ matrix.python-version }} --extra dev --extra all
|
|
76
|
+
- name: Run tests
|
|
77
|
+
run: uv run pytest -v --tb=short
|
|
78
|
+
|
|
79
|
+
# test-macos:
|
|
80
|
+
# name: Test (macOS, Python ${{ matrix.python-version }})
|
|
81
|
+
# runs-on: [self-hosted, macOS]
|
|
82
|
+
# needs: [validate]
|
|
83
|
+
# strategy:
|
|
84
|
+
# fail-fast: false
|
|
85
|
+
# matrix:
|
|
86
|
+
# python-version: ["3.11", "3.12", "3.13"]
|
|
87
|
+
# steps:
|
|
88
|
+
# - uses: actions/checkout@v4
|
|
89
|
+
# - uses: astral-sh/setup-uv@v4
|
|
90
|
+
# with:
|
|
91
|
+
# version: "latest"
|
|
92
|
+
# - name: Set Python version
|
|
93
|
+
# run: uv python install ${{ matrix.python-version }}
|
|
94
|
+
# - name: Install dependencies
|
|
95
|
+
# run: uv sync --python ${{ matrix.python-version }} --extra dev --extra all
|
|
96
|
+
# - name: Run tests
|
|
97
|
+
# run: uv run pytest -v --tb=short
|
|
98
|
+
|
|
99
|
+
build:
|
|
100
|
+
name: Build Package
|
|
101
|
+
runs-on: [self-hosted, Linux]
|
|
102
|
+
needs: [test-linux, test-windows]
|
|
103
|
+
# needs: [test-linux, test-windows, test-macos] # Uncomment when macOS is enabled
|
|
104
|
+
permissions:
|
|
105
|
+
contents: read
|
|
106
|
+
steps:
|
|
107
|
+
- uses: actions/checkout@v4
|
|
108
|
+
- uses: astral-sh/setup-uv@v4
|
|
109
|
+
with:
|
|
110
|
+
version: "latest"
|
|
111
|
+
- name: Build wheel and sdist
|
|
112
|
+
run: uv build
|
|
113
|
+
- name: Verify wheel installs cleanly
|
|
114
|
+
run: |
|
|
115
|
+
uv venv --clear /tmp/lef-release-test
|
|
116
|
+
uv pip install --python /tmp/lef-release-test/bin/python dist/*.whl
|
|
117
|
+
/tmp/lef-release-test/bin/python -c "import lef; print(f'LEF {lef.__version__}')"
|
|
118
|
+
- uses: actions/upload-artifact@v4
|
|
119
|
+
with:
|
|
120
|
+
name: dist
|
|
121
|
+
path: dist/
|
|
122
|
+
retention-days: 90
|
|
123
|
+
|
|
124
|
+
publish-pypi:
|
|
125
|
+
name: Publish to PyPI
|
|
126
|
+
runs-on: [self-hosted, Linux]
|
|
127
|
+
needs: [build]
|
|
128
|
+
permissions:
|
|
129
|
+
contents: read
|
|
130
|
+
id-token: write # Required for trusted publishing
|
|
131
|
+
environment:
|
|
132
|
+
name: pypi
|
|
133
|
+
url: https://pypi.org/project/lefx/
|
|
134
|
+
steps:
|
|
135
|
+
- uses: actions/download-artifact@v4
|
|
136
|
+
with:
|
|
137
|
+
name: dist
|
|
138
|
+
path: dist/
|
|
139
|
+
- name: Publish to PyPI
|
|
140
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
141
|
+
# Uses trusted publishing (OIDC) — no API token needed
|
|
142
|
+
# Requires PyPI project to be configured for GitHub OIDC
|
|
143
|
+
# See: https://docs.pypi.org/trusted-publishers/
|
|
144
|
+
|
|
145
|
+
github-release:
|
|
146
|
+
name: GitHub Release
|
|
147
|
+
runs-on: [self-hosted, Linux]
|
|
148
|
+
needs: [build]
|
|
149
|
+
permissions:
|
|
150
|
+
contents: write # Required to create releases
|
|
151
|
+
steps:
|
|
152
|
+
- uses: actions/checkout@v4
|
|
153
|
+
- uses: actions/download-artifact@v4
|
|
154
|
+
with:
|
|
155
|
+
name: dist
|
|
156
|
+
path: dist/
|
|
157
|
+
- name: Create GitHub Release
|
|
158
|
+
uses: softprops/action-gh-release@v2
|
|
159
|
+
with:
|
|
160
|
+
files: dist/*
|
|
161
|
+
generate_release_notes: true
|
|
162
|
+
name: "LEF ${{ github.ref_name }}"
|
|
163
|
+
|
|
164
|
+
# publish-macos:
|
|
165
|
+
# # If you ever need macOS-specific wheels (e.g. for native extensions)
|
|
166
|
+
# name: Build macOS Wheel
|
|
167
|
+
# runs-on: [self-hosted, macOS]
|
|
168
|
+
# needs: [test-macos]
|
|
169
|
+
# steps:
|
|
170
|
+
# - uses: actions/checkout@v4
|
|
171
|
+
# - uses: astral-sh/setup-uv@v4
|
|
172
|
+
# with:
|
|
173
|
+
# version: "latest"
|
|
174
|
+
# - name: Build
|
|
175
|
+
# run: uv build
|
|
176
|
+
# - uses: actions/upload-artifact@v4
|
|
177
|
+
# with:
|
|
178
|
+
# name: dist-macos
|
|
179
|
+
# path: dist/
|
lefx-0.3.0/.gitignore
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*$py.class
|
|
4
|
+
*.so
|
|
5
|
+
.lef/
|
|
6
|
+
dist/
|
|
7
|
+
build/
|
|
8
|
+
*.egg-info/
|
|
9
|
+
*.egg
|
|
10
|
+
.eggs/
|
|
11
|
+
*.whl
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
.env
|
|
16
|
+
.env.*
|
|
17
|
+
!.env.example
|
|
18
|
+
*.log
|
|
19
|
+
.mypy_cache/
|
|
20
|
+
.ruff_cache/
|
|
21
|
+
.pytest_cache/
|
|
22
|
+
htmlcov/
|
|
23
|
+
.coverage
|
|
24
|
+
.coverage.*
|
|
25
|
+
coverage.xml
|
|
26
|
+
*.cover
|
|
27
|
+
.hypothesis/
|
|
28
|
+
.tox/
|
|
29
|
+
.nox/
|
|
30
|
+
.uv/
|
|
31
|
+
uv.lock
|
lefx-0.3.0/CHANGELOG.md
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to LEF will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.3.0] - 2026-03-30
|
|
9
|
+
|
|
10
|
+
### Changed
|
|
11
|
+
- Renamed PyPI package from `lef` to `lefx` (import remains `import lef`)
|
|
12
|
+
|
|
13
|
+
## [0.2.0] - 2026-03-28
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
- **Result export**: JSON, CSV, and JUnit XML export via `--output` flag and `export_results()` API
|
|
17
|
+
- **Rich terminal output**: `format_results_table()` with per-metric summary, min/max/avg, and threshold pass/fail
|
|
18
|
+
- **Git-aware metadata**: Auto-injects branch, commit SHA, author into experiment metadata; detects CI environments (GitHub Actions, Azure DevOps, GitLab CI, Jenkins)
|
|
19
|
+
- **Baseline management**: `save_baseline()`, `load_baseline()`, `compare_results()` for regression detection; `lef baseline list/delete` CLI
|
|
20
|
+
- **Experiment comparison**: `ComparisonReport` with table/markdown output and `compare_experiments()` for LangSmith experiments
|
|
21
|
+
- **Result caching**: `ResultCache` with content-addressable hashing and TTL for avoiding re-invocation of expensive targets
|
|
22
|
+
- **Watch mode**: `--watch` flag for auto-rerun on file changes during iterative development
|
|
23
|
+
- **CI/CD PR comments**: `post_github_comment()` and `post_azdo_comment()` for posting results to GitHub and Azure DevOps PRs
|
|
24
|
+
- **QA endpoint testing**: `lef qa` subcommand for testing deployed HTTP endpoints against datasets
|
|
25
|
+
- **pytest plugin**: `lef_eval` fixture and config-file collection via `pytest11` entry point
|
|
26
|
+
- **Synthetic data generation**: `generate_from_docs()`, `generate_from_traces()`, `generate_adversarial()`, `diversify_dataset()`
|
|
27
|
+
- **Production monitoring**: `MonitorDaemon` for continuous evaluation of LangSmith runs with threshold alerting
|
|
28
|
+
- **Red-team testing**: `run_redteam()` with 6 attack categories (prompt injection, jailbreak, PII extraction, hallucination inducement, toxicity, bias) and 3 built-in safety scorers
|
|
29
|
+
- **Remote targets**: HTTP URLs supported as targets in config files via `create_remote_target()`
|
|
30
|
+
- **File path targets**: Target functions can be referenced by file path (`/path/to/file.py:function`)
|
|
31
|
+
- **Multi-config composition**: Pass multiple YAML configs to merge evaluators, thresholds, and metadata
|
|
32
|
+
- **CLI subcommands**: `compare`, `baseline`, `qa`, `monitor`, `redteam`, `dataset` (pull/push/diff/generate)
|
|
33
|
+
- **`--version` flag**: `lef -v` / `lef --version` shows version
|
|
34
|
+
- `py.typed` marker for PEP 561 type information
|
|
35
|
+
- 84 public API exports (up from 52)
|
|
36
|
+
|
|
37
|
+
### Fixed
|
|
38
|
+
- Outputs kwarg override bug in dual-signature resolution across decorators and base classes
|
|
39
|
+
- `pass_rate` treating missing feedback as passing (now correctly treats as failing)
|
|
40
|
+
- Client reuse in `online/tracing.py` (singleton default client avoids per-call overhead)
|
|
41
|
+
- GitHub API URL for comment updates (was missing `/issues/` path segment)
|
|
42
|
+
- Monitor `_seen_run_ids` used unordered set for eviction (now uses OrderedDict)
|
|
43
|
+
- `compare_results()` now handles baseline dict format for both parameters
|
|
44
|
+
- `_load_config()` rejects non-dict YAML (empty files, arrays) with clear error
|
|
45
|
+
- Malformed config files produce clean error messages instead of raw tracebacks
|
|
46
|
+
- Duplicate `-v`/`--verbose` flag between parent and run subparser
|
|
47
|
+
- `lef` with no arguments now exits with code 2 (was incorrectly 0)
|
|
48
|
+
|
|
49
|
+
### Changed
|
|
50
|
+
- Version bumped to 0.2.0
|
|
51
|
+
- Development Status classifier changed from Alpha to Beta
|
|
52
|
+
- `-v` flag changed from `--verbose` to `--version` (use `--verbose` for debug logging)
|
|
53
|
+
- `pyproject.toml` extras now use self-references to avoid duplicate version pins
|
|
54
|
+
|
|
55
|
+
## [0.1.0] - 2024-12-01
|
|
56
|
+
|
|
57
|
+
### Added
|
|
58
|
+
- Core evaluation framework with `EvalResult`, `EvalResultBatch`, and `JudgeModel` types
|
|
59
|
+
- `@scorer` and `@evaluator` decorators for creating LangSmith-compatible evaluators
|
|
60
|
+
- Built-in scorers: `exact_match`, `contains`, `regex_match`, `json_match`
|
|
61
|
+
- Custom scorer factories: `create_scorer`, `create_composite_scorer`
|
|
62
|
+
- 14 pre-built LLM-as-Judge evaluators (correctness, safety, hallucination, RAG, etc.)
|
|
63
|
+
- Custom judge creation via `create_judge()` with openevals integration
|
|
64
|
+
- Agent trajectory evaluation via `create_trajectory_evaluator` and `create_trajectory_judge`
|
|
65
|
+
- Dataset runner: `run_eval`, `arun_eval`, `EvalRunner`, `create_dataset`
|
|
66
|
+
- Local dataset loading from YAML, JSON, and CSV files
|
|
67
|
+
- Online evaluation: `evaluate_run`, `OnlineEvaluator`, `create_rule`
|
|
68
|
+
- LangChain integration: `evaluate_chain`, `create_chain_target` (sync + async)
|
|
69
|
+
- LangGraph integration: `evaluate_graph`, `create_graph_target` (sync + async)
|
|
70
|
+
- CI/CD gating with `assert_scores` and `check_scores`
|
|
71
|
+
- CLI tool (`lef run`) for running evaluation suites from config files
|
|
72
|
+
- `LefConfig` for environment-based configuration
|
|
73
|
+
- Dual-signature compatibility (legacy `run, example` and new-style kwargs)
|
|
74
|
+
- 222 tests with full mocking (no API calls required)
|
|
75
|
+
- Comprehensive README with quick-start examples and API reference
|
lefx-0.3.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## What is this project?
|
|
6
|
+
|
|
7
|
+
LEF (LangSmith Evaluation Framework) is a plug-and-play evaluation system for LangChain, LangGraph, and LangSmith projects. It wraps `langsmith`, `openevals`, and `agentevals` into a unified framework with built-in QA/CI support.
|
|
8
|
+
|
|
9
|
+
## Common Commands
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
# Install dependencies
|
|
13
|
+
uv sync --extra dev --extra all
|
|
14
|
+
|
|
15
|
+
# Run all tests
|
|
16
|
+
uv run pytest
|
|
17
|
+
|
|
18
|
+
# Run a single test file
|
|
19
|
+
uv run pytest tests/test_scorers.py
|
|
20
|
+
|
|
21
|
+
# Run a single test by name
|
|
22
|
+
uv run pytest tests/test_scorers.py -k "test_exact_match"
|
|
23
|
+
|
|
24
|
+
# Lint
|
|
25
|
+
uv run ruff check src/ tests/
|
|
26
|
+
|
|
27
|
+
# Lint with auto-fix
|
|
28
|
+
uv run ruff check --fix src/ tests/
|
|
29
|
+
|
|
30
|
+
# Type check
|
|
31
|
+
uv run mypy src/
|
|
32
|
+
|
|
33
|
+
# Run the CLI
|
|
34
|
+
uv run lef run <config.yaml>
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Architecture
|
|
38
|
+
|
|
39
|
+
Source lives in `src/lef/`. All public API is exported from `src/lef/__init__.py` via `__all__`.
|
|
40
|
+
|
|
41
|
+
### Key design decisions
|
|
42
|
+
|
|
43
|
+
- **EvalResult is a `dict` subclass** (not a Pydantic model) — this is required because LangSmith's `evaluate()` expects dict-like results. It also provides property access (`.key`, `.score`, `.comment`, `.metadata`).
|
|
44
|
+
- **Dual-signature compatibility** — all evaluators accept both LangSmith's legacy `(run, example)` and new-style `(inputs=, outputs=, reference_outputs=)` keyword signatures. The `@scorer` decorator in `core/decorators.py` handles this normalization.
|
|
45
|
+
- **Decorators normalize return types** — `@scorer` accepts `bool`, `float`, `int`, `dict`, or `EvalResult` returns and always produces a LangSmith-compatible dict.
|
|
46
|
+
- **openevals judges are thin wrappers** — `judges/llm.py` factory functions (e.g., `correctness_judge()`) wrap `openevals.create_llm_as_judge()` with LEF defaults and return callables.
|
|
47
|
+
- **Local datasets don't need LangSmith** — `load_examples()` from `datasets/loader.py` supports YAML/JSON/CSV. Combined with `upload_results=False`, evaluation runs fully offline.
|
|
48
|
+
|
|
49
|
+
### Module layout
|
|
50
|
+
|
|
51
|
+
**Core:**
|
|
52
|
+
- `core/types.py` — `EvalResult`, `EvalResultBatch`, `JudgeModel` enum, evaluator protocols
|
|
53
|
+
- `core/base.py` — `BaseEvaluator`, `AsyncBaseEvaluator` abstract classes
|
|
54
|
+
- `core/decorators.py` — `@scorer` and `@evaluator` decorators
|
|
55
|
+
|
|
56
|
+
**Scorers & Judges:**
|
|
57
|
+
- `scorers/builtin.py` — Rule-based scorers: `exact_match`, `contains`, `regex_match`, `json_match`, `mean_score`, `pass_rate`
|
|
58
|
+
- `scorers/custom.py` — `create_scorer`, `create_composite_scorer` factories
|
|
59
|
+
- `judges/llm.py` — 20+ LLM-as-judge factories (correctness, safety, hallucination, RAG, etc.)
|
|
60
|
+
- `judges/trajectory.py` — Agent trajectory evaluators wrapping `agentevals`
|
|
61
|
+
|
|
62
|
+
**Data & Runner:**
|
|
63
|
+
- `datasets/runner.py` — `run_eval`, `arun_eval`, `EvalRunner`, `create_dataset`
|
|
64
|
+
- `datasets/loader.py` — `load_examples` for local YAML/JSON/CSV files
|
|
65
|
+
|
|
66
|
+
**CLI:**
|
|
67
|
+
- `cli.py` — CLI entry point with 7 subcommands: `run`, `compare`, `baseline`, `qa`, `monitor`, `redteam`, `dataset`
|
|
68
|
+
|
|
69
|
+
**Integrations:**
|
|
70
|
+
- `integrations/langchain.py` — LangChain chain evaluation
|
|
71
|
+
- `integrations/langgraph.py` — LangGraph graph evaluation
|
|
72
|
+
- `integrations/remote.py` — HTTP endpoint targets (`create_remote_target`)
|
|
73
|
+
|
|
74
|
+
**v0.2.0 Modules:**
|
|
75
|
+
- `output.py` — Rich terminal tables + JSON/CSV/JUnit XML export
|
|
76
|
+
- `git_context.py` — Git-aware experiment metadata (branch, commit, CI detection)
|
|
77
|
+
- `compare.py` — Baseline management and experiment comparison
|
|
78
|
+
- `cache.py` — Content-addressable result caching with TTL
|
|
79
|
+
- `watch.py` — File-watching mode for iterative development
|
|
80
|
+
- `ci/github.py` — GitHub PR comment integration
|
|
81
|
+
- `ci/azuredevops.py` — Azure DevOps PR comment integration
|
|
82
|
+
- `pytest_plugin.py` — pytest plugin with `lef_eval` fixture (entry point: `pytest11`)
|
|
83
|
+
- `synthetic.py` — Synthetic dataset generation from documents/traces
|
|
84
|
+
- `monitor.py` — Production monitoring daemon with threshold alerting
|
|
85
|
+
- `redteam.py` — Adversarial red-team testing with 6 attack categories
|
|
86
|
+
- `online/tracing.py` — Online evaluation of production LangSmith runs
|
|
87
|
+
|
|
88
|
+
## Testing
|
|
89
|
+
|
|
90
|
+
- All tests in `tests/`, using pytest with `asyncio_mode = "auto"` (no need for `@pytest.mark.asyncio`)
|
|
91
|
+
- Tests mock LLM APIs via `unittest.mock.patch` — no real API calls needed
|
|
92
|
+
- Shared fixtures in `tests/conftest.py`: `sample_inputs`, `sample_outputs`, `sample_reference_outputs`, `wrong_outputs`, `empty_outputs`
|
|
93
|
+
|
|
94
|
+
## When Adding New Evaluators
|
|
95
|
+
|
|
96
|
+
1. Add the factory function to `src/lef/judges/llm.py` (for LLM judges) or `src/lef/scorers/builtin.py` (for rule-based)
|
|
97
|
+
2. Export from the sub-package `__init__.py`
|
|
98
|
+
3. Export from the top-level `src/lef/__init__.py` and add to `__all__`
|
|
99
|
+
4. Add to `_BUILTIN_EVALUATORS` in `src/lef/cli.py` for CLI support
|
|
100
|
+
5. Add tests in `tests/`
|
|
101
|
+
6. Update the README table
|
lefx-0.3.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 bogware
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|