lefx-0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. lefx-0.3.0/.env.example +8 -0
  2. lefx-0.3.0/.github/dependabot.yml +11 -0
  3. lefx-0.3.0/.github/workflows/ci.yml +136 -0
  4. lefx-0.3.0/.github/workflows/release.yml +179 -0
  5. lefx-0.3.0/.gitignore +31 -0
  6. lefx-0.3.0/CHANGELOG.md +75 -0
  7. lefx-0.3.0/CLAUDE.md +101 -0
  8. lefx-0.3.0/LICENSE +21 -0
  9. lefx-0.3.0/MANIFEST.in +5 -0
  10. lefx-0.3.0/PKG-INFO +1336 -0
  11. lefx-0.3.0/README.md +1289 -0
  12. lefx-0.3.0/docs/walkthrough.md +749 -0
  13. lefx-0.3.0/examples/custom_scorer.py +140 -0
  14. lefx-0.3.0/examples/dataset_eval.py +100 -0
  15. lefx-0.3.0/examples/eval_suite.yaml +43 -0
  16. lefx-0.3.0/examples/langgraph_eval.py +121 -0
  17. lefx-0.3.0/examples/llm_as_judge.py +69 -0
  18. lefx-0.3.0/examples/online_eval.py +71 -0
  19. lefx-0.3.0/examples/quickstart.py +55 -0
  20. lefx-0.3.0/examples/test_data.yaml +17 -0
  21. lefx-0.3.0/pyproject.toml +96 -0
  22. lefx-0.3.0/src/lef/__init__.py +256 -0
  23. lefx-0.3.0/src/lef/assertions.py +148 -0
  24. lefx-0.3.0/src/lef/cache.py +146 -0
  25. lefx-0.3.0/src/lef/ci/__init__.py +13 -0
  26. lefx-0.3.0/src/lef/ci/azuredevops.py +211 -0
  27. lefx-0.3.0/src/lef/ci/github.py +183 -0
  28. lefx-0.3.0/src/lef/cli.py +1086 -0
  29. lefx-0.3.0/src/lef/compare.py +374 -0
  30. lefx-0.3.0/src/lef/config.py +94 -0
  31. lefx-0.3.0/src/lef/core/__init__.py +23 -0
  32. lefx-0.3.0/src/lef/core/base.py +115 -0
  33. lefx-0.3.0/src/lef/core/decorators.py +147 -0
  34. lefx-0.3.0/src/lef/core/types.py +113 -0
  35. lefx-0.3.0/src/lef/datasets/__init__.py +19 -0
  36. lefx-0.3.0/src/lef/datasets/loader.py +135 -0
  37. lefx-0.3.0/src/lef/datasets/runner.py +360 -0
  38. lefx-0.3.0/src/lef/git_context.py +185 -0
  39. lefx-0.3.0/src/lef/integrations/__init__.py +31 -0
  40. lefx-0.3.0/src/lef/integrations/langchain.py +162 -0
  41. lefx-0.3.0/src/lef/integrations/langgraph.py +143 -0
  42. lefx-0.3.0/src/lef/integrations/remote.py +206 -0
  43. lefx-0.3.0/src/lef/judges/__init__.py +43 -0
  44. lefx-0.3.0/src/lef/judges/llm.py +179 -0
  45. lefx-0.3.0/src/lef/judges/prompts.py +95 -0
  46. lefx-0.3.0/src/lef/judges/trajectory.py +87 -0
  47. lefx-0.3.0/src/lef/monitor.py +223 -0
  48. lefx-0.3.0/src/lef/online/__init__.py +15 -0
  49. lefx-0.3.0/src/lef/online/tracing.py +251 -0
  50. lefx-0.3.0/src/lef/output.py +285 -0
  51. lefx-0.3.0/src/lef/py.typed +0 -0
  52. lefx-0.3.0/src/lef/pytest_plugin.py +204 -0
  53. lefx-0.3.0/src/lef/redteam.py +292 -0
  54. lefx-0.3.0/src/lef/scorers/__init__.py +22 -0
  55. lefx-0.3.0/src/lef/scorers/builtin.py +241 -0
  56. lefx-0.3.0/src/lef/scorers/custom.py +110 -0
  57. lefx-0.3.0/src/lef/synthetic.py +373 -0
  58. lefx-0.3.0/src/lef/watch.py +121 -0
  59. lefx-0.3.0/tests/__init__.py +0 -0
  60. lefx-0.3.0/tests/conftest.py +28 -0
  61. lefx-0.3.0/tests/test_assertions.py +137 -0
  62. lefx-0.3.0/tests/test_cli.py +118 -0
  63. lefx-0.3.0/tests/test_cli_extended.py +185 -0
  64. lefx-0.3.0/tests/test_config.py +36 -0
  65. lefx-0.3.0/tests/test_config_integration.py +149 -0
  66. lefx-0.3.0/tests/test_core.py +164 -0
  67. lefx-0.3.0/tests/test_integrations.py +142 -0
  68. lefx-0.3.0/tests/test_judges.py +145 -0
  69. lefx-0.3.0/tests/test_loader.py +153 -0
  70. lefx-0.3.0/tests/test_new_modules.py +954 -0
  71. lefx-0.3.0/tests/test_new_modules_2.py +597 -0
  72. lefx-0.3.0/tests/test_online.py +220 -0
  73. lefx-0.3.0/tests/test_online_extended.py +189 -0
  74. lefx-0.3.0/tests/test_remote.py +124 -0
  75. lefx-0.3.0/tests/test_runner.py +316 -0
  76. lefx-0.3.0/tests/test_scorers.py +186 -0
  77. lefx-0.3.0/tests/test_summary_evaluators.py +171 -0
  78. lefx-0.3.0/tests/test_trajectory.py +32 -0
@@ -0,0 +1,8 @@
1
+ # LangSmith Configuration
2
+ LANGCHAIN_API_KEY=your-langsmith-api-key
3
+ LANGCHAIN_TRACING_V2=true
4
+ LANGCHAIN_PROJECT=my-project
5
+
6
+ # LLM Provider Keys (for LLM-as-Judge evaluators)
7
+ OPENAI_API_KEY=your-openai-api-key
8
+ ANTHROPIC_API_KEY=your-anthropic-api-key
@@ -0,0 +1,11 @@
1
+ version: 2
2
+ updates:
3
+ - package-ecosystem: "github-actions"
4
+ directory: "/"
5
+ schedule:
6
+ interval: "weekly"
7
+ - package-ecosystem: "pip"
8
+ directory: "/"
9
+ schedule:
10
+ interval: "weekly"
11
+ open-pull-requests-limit: 10
@@ -0,0 +1,136 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ # Cancel in-progress runs for the same branch/PR
10
+ concurrency:
11
+ group: ci-${{ github.ref }}
12
+ cancel-in-progress: true
13
+
14
+ # Restrict default permissions (public repo hardening)
15
+ permissions:
16
+ contents: read
17
+
18
+ jobs:
19
+ lint:
20
+ name: Lint
21
+ runs-on: [self-hosted, Linux]
22
+ steps:
23
+ - uses: actions/checkout@v4
24
+ - uses: astral-sh/setup-uv@v4
25
+ with:
26
+ version: "latest"
27
+ - name: Install dependencies
28
+ run: uv sync --extra dev
29
+ - name: Ruff check
30
+ run: uv run ruff check src/ tests/
31
+ - name: Ruff format check
32
+ run: uv run ruff format --check src/ tests/
33
+
34
+ test-linux:
35
+ name: Test (Linux, Python ${{ matrix.python-version }})
36
+ runs-on: [self-hosted, Linux]
37
+ strategy:
38
+ fail-fast: false
39
+ matrix:
40
+ python-version: ["3.11", "3.12", "3.13"]
41
+ steps:
42
+ - uses: actions/checkout@v4
43
+ - uses: astral-sh/setup-uv@v4
44
+ with:
45
+ version: "latest"
46
+ - name: Set Python version
47
+ run: uv python install ${{ matrix.python-version }}
48
+ - name: Install dependencies
49
+ run: uv sync --python ${{ matrix.python-version }} --extra dev --extra all
50
+ - name: Run tests
51
+ run: uv run pytest -v --tb=short
52
+ - name: Verify CLI entry point
53
+ run: uv run lef --help
54
+ - name: Verify package import
55
+ run: uv run python -c "import lef; print(f'LEF {lef.__version__} — {len(lef.__all__)} exports')"
56
+
57
+ test-windows:
58
+ name: Test (Windows, Python ${{ matrix.python-version }})
59
+ runs-on: [self-hosted, Windows]
60
+ strategy:
61
+ fail-fast: false
62
+ matrix:
63
+ python-version: ["3.11", "3.12", "3.13"]
64
+ steps:
65
+ - uses: actions/checkout@v4
66
+ - uses: astral-sh/setup-uv@v4
67
+ with:
68
+ version: "latest"
69
+ - name: Set Python version
70
+ run: uv python install ${{ matrix.python-version }}
71
+ - name: Install dependencies
72
+ run: uv sync --python ${{ matrix.python-version }} --extra dev --extra all
73
+ - name: Run tests
74
+ run: uv run pytest -v --tb=short
75
+ - name: Verify CLI entry point
76
+ run: uv run lef --help
77
+ - name: Verify package import
78
+ run: uv run python -c "import lef; print(f'LEF {lef.__version__} — {len(lef.__all__)} exports')"
79
+
80
+ # test-macos:
81
+ # name: Test (macOS, Python ${{ matrix.python-version }})
82
+ # runs-on: [self-hosted, macOS]
83
+ # strategy:
84
+ # fail-fast: false
85
+ # matrix:
86
+ # python-version: ["3.11", "3.12", "3.13"]
87
+ # steps:
88
+ # - uses: actions/checkout@v4
89
+ # - uses: astral-sh/setup-uv@v4
90
+ # with:
91
+ # version: "latest"
92
+ # - name: Set Python version
93
+ # run: uv python install ${{ matrix.python-version }}
94
+ # - name: Install dependencies
95
+ # run: uv sync --python ${{ matrix.python-version }} --extra dev --extra all
96
+ # - name: Run tests
97
+ # run: uv run pytest -v --tb=short
98
+ # - name: Verify CLI entry point
99
+ # run: uv run lef --help
100
+ # - name: Verify package import
101
+ # run: uv run python -c "import lef; print(f'LEF {lef.__version__} — {len(lef.__all__)} exports')"
102
+
103
+ typecheck:
104
+ name: Type Check
105
+ runs-on: [self-hosted, Linux]
106
+ steps:
107
+ - uses: actions/checkout@v4
108
+ - uses: astral-sh/setup-uv@v4
109
+ with:
110
+ version: "latest"
111
+ - name: Install dependencies
112
+ run: uv sync --extra dev --extra all
113
+ - name: MyPy
114
+ run: uv run mypy src/lef/ --ignore-missing-imports
115
+
116
+ build:
117
+ name: Build Package
118
+ runs-on: [self-hosted, Linux]
119
+ needs: [lint, test-linux]
120
+ steps:
121
+ - uses: actions/checkout@v4
122
+ - uses: astral-sh/setup-uv@v4
123
+ with:
124
+ version: "latest"
125
+ - name: Build wheel and sdist
126
+ run: uv build
127
+ - name: Verify wheel installs
128
+ run: |
129
+ uv venv --clear /tmp/lef-install-test
130
+ uv pip install --python /tmp/lef-install-test/bin/python dist/*.whl
131
+ /tmp/lef-install-test/bin/python -c "import lef; print(f'LEF {lef.__version__}')"
132
+ - uses: actions/upload-artifact@v4
133
+ with:
134
+ name: dist
135
+ path: dist/
136
+ retention-days: 30
@@ -0,0 +1,179 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ # Restrict default permissions (public repo hardening)
9
+ permissions:
10
+ contents: read
11
+
12
+ jobs:
13
+ validate:
14
+ name: Validate Tag
15
+ runs-on: [self-hosted, Linux]
16
+ outputs:
17
+ version: ${{ steps.version.outputs.version }}
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ - name: Extract version from tag
21
+ id: version
22
+ run: |
23
+ TAG="${GITHUB_REF#refs/tags/v}"
24
+ echo "version=$TAG" >> "$GITHUB_OUTPUT"
25
+ echo "Release version: $TAG"
26
+ - uses: astral-sh/setup-uv@v4
27
+ with:
28
+ version: "latest"
29
+ - name: Verify tag matches pyproject.toml version
30
+ run: |
31
+ PKG_VERSION=$(uv run python -c "import lef; print(lef.__version__)")
32
+ TAG_VERSION="${GITHUB_REF#refs/tags/v}"
33
+ if [ "$PKG_VERSION" != "$TAG_VERSION" ]; then
34
+ echo "ERROR: Tag version ($TAG_VERSION) does not match package version ($PKG_VERSION)"
35
+ exit 1
36
+ fi
37
+ echo "Version match confirmed: $PKG_VERSION"
38
+
39
+ test-linux:
40
+ name: Test (Linux, Python ${{ matrix.python-version }})
41
+ runs-on: [self-hosted, Linux]
42
+ needs: [validate]
43
+ strategy:
44
+ fail-fast: false
45
+ matrix:
46
+ python-version: ["3.11", "3.12", "3.13"]
47
+ steps:
48
+ - uses: actions/checkout@v4
49
+ - uses: astral-sh/setup-uv@v4
50
+ with:
51
+ version: "latest"
52
+ - name: Set Python version
53
+ run: uv python install ${{ matrix.python-version }}
54
+ - name: Install dependencies
55
+ run: uv sync --python ${{ matrix.python-version }} --extra dev --extra all
56
+ - name: Run tests
57
+ run: uv run pytest -v --tb=short
58
+
59
+ test-windows:
60
+ name: Test (Windows, Python ${{ matrix.python-version }})
61
+ runs-on: [self-hosted, Windows]
62
+ needs: [validate]
63
+ strategy:
64
+ fail-fast: false
65
+ matrix:
66
+ python-version: ["3.11", "3.12", "3.13"]
67
+ steps:
68
+ - uses: actions/checkout@v4
69
+ - uses: astral-sh/setup-uv@v4
70
+ with:
71
+ version: "latest"
72
+ - name: Set Python version
73
+ run: uv python install ${{ matrix.python-version }}
74
+ - name: Install dependencies
75
+ run: uv sync --python ${{ matrix.python-version }} --extra dev --extra all
76
+ - name: Run tests
77
+ run: uv run pytest -v --tb=short
78
+
79
+ # test-macos:
80
+ # name: Test (macOS, Python ${{ matrix.python-version }})
81
+ # runs-on: [self-hosted, macOS]
82
+ # needs: [validate]
83
+ # strategy:
84
+ # fail-fast: false
85
+ # matrix:
86
+ # python-version: ["3.11", "3.12", "3.13"]
87
+ # steps:
88
+ # - uses: actions/checkout@v4
89
+ # - uses: astral-sh/setup-uv@v4
90
+ # with:
91
+ # version: "latest"
92
+ # - name: Set Python version
93
+ # run: uv python install ${{ matrix.python-version }}
94
+ # - name: Install dependencies
95
+ # run: uv sync --python ${{ matrix.python-version }} --extra dev --extra all
96
+ # - name: Run tests
97
+ # run: uv run pytest -v --tb=short
98
+
99
+ build:
100
+ name: Build Package
101
+ runs-on: [self-hosted, Linux]
102
+ needs: [test-linux, test-windows]
103
+ # needs: [test-linux, test-windows, test-macos] # Uncomment when macOS is enabled
104
+ permissions:
105
+ contents: read
106
+ steps:
107
+ - uses: actions/checkout@v4
108
+ - uses: astral-sh/setup-uv@v4
109
+ with:
110
+ version: "latest"
111
+ - name: Build wheel and sdist
112
+ run: uv build
113
+ - name: Verify wheel installs cleanly
114
+ run: |
115
+ uv venv --clear /tmp/lef-release-test
116
+ uv pip install --python /tmp/lef-release-test/bin/python dist/*.whl
117
+ /tmp/lef-release-test/bin/python -c "import lef; print(f'LEF {lef.__version__}')"
118
+ - uses: actions/upload-artifact@v4
119
+ with:
120
+ name: dist
121
+ path: dist/
122
+ retention-days: 90
123
+
124
+ publish-pypi:
125
+ name: Publish to PyPI
126
+ runs-on: [self-hosted, Linux]
127
+ needs: [build]
128
+ permissions:
129
+ contents: read
130
+ id-token: write # Required for trusted publishing
131
+ environment:
132
+ name: pypi
133
+ url: https://pypi.org/project/lefx/
134
+ steps:
135
+ - uses: actions/download-artifact@v4
136
+ with:
137
+ name: dist
138
+ path: dist/
139
+ - name: Publish to PyPI
140
+ uses: pypa/gh-action-pypi-publish@release/v1
141
+ # Uses trusted publishing (OIDC) — no API token needed
142
+ # Requires PyPI project to be configured for GitHub OIDC
143
+ # See: https://docs.pypi.org/trusted-publishers/
144
+
145
+ github-release:
146
+ name: GitHub Release
147
+ runs-on: [self-hosted, Linux]
148
+ needs: [build]
149
+ permissions:
150
+ contents: write # Required to create releases
151
+ steps:
152
+ - uses: actions/checkout@v4
153
+ - uses: actions/download-artifact@v4
154
+ with:
155
+ name: dist
156
+ path: dist/
157
+ - name: Create GitHub Release
158
+ uses: softprops/action-gh-release@v2
159
+ with:
160
+ files: dist/*
161
+ generate_release_notes: true
162
+ name: "LEF ${{ github.ref_name }}"
163
+
164
+ # publish-macos:
165
+ # # If you ever need macOS-specific wheels (e.g. for native extensions)
166
+ # name: Build macOS Wheel
167
+ # runs-on: [self-hosted, macOS]
168
+ # needs: [test-macos]
169
+ # steps:
170
+ # - uses: actions/checkout@v4
171
+ # - uses: astral-sh/setup-uv@v4
172
+ # with:
173
+ # version: "latest"
174
+ # - name: Build
175
+ # run: uv build
176
+ # - uses: actions/upload-artifact@v4
177
+ # with:
178
+ # name: dist-macos
179
+ # path: dist/
lefx-0.3.0/.gitignore ADDED
@@ -0,0 +1,31 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.so
5
+ .lef/
6
+ dist/
7
+ build/
8
+ *.egg-info/
9
+ *.egg
10
+ .eggs/
11
+ *.whl
12
+ .venv/
13
+ venv/
14
+ env/
15
+ .env
16
+ .env.*
17
+ !.env.example
18
+ *.log
19
+ .mypy_cache/
20
+ .ruff_cache/
21
+ .pytest_cache/
22
+ htmlcov/
23
+ .coverage
24
+ .coverage.*
25
+ coverage.xml
26
+ *.cover
27
+ .hypothesis/
28
+ .tox/
29
+ .nox/
30
+ .uv/
31
+ uv.lock
@@ -0,0 +1,75 @@
1
+ # Changelog
2
+
3
+ All notable changes to LEF will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.3.0] - 2026-03-30
9
+
10
+ ### Changed
11
+ - Renamed PyPI package from `lef` to `lefx` (import remains `import lef`)
12
+
13
+ ## [0.2.0] - 2026-03-28
14
+
15
+ ### Added
16
+ - **Result export**: JSON, CSV, and JUnit XML export via `--output` flag and `export_results()` API
17
+ - **Rich terminal output**: `format_results_table()` with per-metric summary, min/max/avg, and threshold pass/fail
18
+ - **Git-aware metadata**: Auto-injects branch, commit SHA, author into experiment metadata; detects CI environments (GitHub Actions, Azure DevOps, GitLab CI, Jenkins)
19
+ - **Baseline management**: `save_baseline()`, `load_baseline()`, `compare_results()` for regression detection; `lef baseline list/delete` CLI
20
+ - **Experiment comparison**: `ComparisonReport` with table/markdown output and `compare_experiments()` for LangSmith experiments
21
+ - **Result caching**: `ResultCache` with content-addressable hashing and TTL for avoiding re-invocation of expensive targets
22
+ - **Watch mode**: `--watch` flag for auto-rerun on file changes during iterative development
23
+ - **CI/CD PR comments**: `post_github_comment()` and `post_azdo_comment()` for posting results to GitHub and Azure DevOps PRs
24
+ - **QA endpoint testing**: `lef qa` subcommand for testing deployed HTTP endpoints against datasets
25
+ - **pytest plugin**: `lef_eval` fixture and config-file collection via `pytest11` entry point
26
+ - **Synthetic data generation**: `generate_from_docs()`, `generate_from_traces()`, `generate_adversarial()`, `diversify_dataset()`
27
+ - **Production monitoring**: `MonitorDaemon` for continuous evaluation of LangSmith runs with threshold alerting
28
+ - **Red-team testing**: `run_redteam()` with 6 attack categories (prompt injection, jailbreak, PII extraction, hallucination inducement, toxicity, bias) and 3 built-in safety scorers
29
+ - **Remote targets**: HTTP URLs supported as targets in config files via `create_remote_target()`
30
+ - **File path targets**: Target functions can be referenced by file path (`/path/to/file.py:function`)
31
+ - **Multi-config composition**: Pass multiple YAML configs to merge evaluators, thresholds, and metadata
32
+ - **CLI subcommands**: `compare`, `baseline`, `qa`, `monitor`, `redteam`, `dataset` (pull/push/diff/generate)
33
+ - **`--version` flag**: `lef -v` / `lef --version` shows version
34
+ - `py.typed` marker for PEP 561 type information
35
+ - 84 public API exports (up from 52)
36
+
37
+ ### Fixed
38
+ - Outputs kwarg override bug in dual-signature resolution across decorators and base classes
39
+ - `pass_rate` treating missing feedback as passing (now correctly treats as failing)
40
+ - Client reuse in `online/tracing.py` (singleton default client avoids per-call overhead)
41
+ - GitHub API URL for comment updates (was missing `/issues/` path segment)
42
+ - Monitor `_seen_run_ids` used unordered set for eviction (now uses OrderedDict)
43
+ - `compare_results()` now handles baseline dict format for both parameters
44
+ - `_load_config()` rejects non-dict YAML (empty files, arrays) with clear error
45
+ - Malformed config files produce clean error messages instead of raw tracebacks
46
+ - Duplicate `-v`/`--verbose` flag between parent and run subparser
47
+ - `lef` with no arguments now exits with code 2 (was incorrectly 0)
48
+
49
+ ### Changed
50
+ - Version bumped to 0.2.0
51
+ - Development Status classifier changed from Alpha to Beta
52
+ - `-v` flag changed from `--verbose` to `--version` (use `--verbose` for debug logging)
53
+ - `pyproject.toml` extras now use self-references to avoid duplicate version pins
54
+
55
+ ## [0.1.0] - 2024-12-01
56
+
57
+ ### Added
58
+ - Core evaluation framework with `EvalResult`, `EvalResultBatch`, and `JudgeModel` types
59
+ - `@scorer` and `@evaluator` decorators for creating LangSmith-compatible evaluators
60
+ - Built-in scorers: `exact_match`, `contains`, `regex_match`, `json_match`
61
+ - Custom scorer factories: `create_scorer`, `create_composite_scorer`
62
+ - 14 pre-built LLM-as-Judge evaluators (correctness, safety, hallucination, RAG, etc.)
63
+ - Custom judge creation via `create_judge()` with openevals integration
64
+ - Agent trajectory evaluation via `create_trajectory_evaluator` and `create_trajectory_judge`
65
+ - Dataset runner: `run_eval`, `arun_eval`, `EvalRunner`, `create_dataset`
66
+ - Local dataset loading from YAML, JSON, and CSV files
67
+ - Online evaluation: `evaluate_run`, `OnlineEvaluator`, `create_rule`
68
+ - LangChain integration: `evaluate_chain`, `create_chain_target` (sync + async)
69
+ - LangGraph integration: `evaluate_graph`, `create_graph_target` (sync + async)
70
+ - CI/CD gating with `assert_scores` and `check_scores`
71
+ - CLI tool (`lef run`) for running evaluation suites from config files
72
+ - `LefConfig` for environment-based configuration
73
+ - Dual-signature compatibility (legacy `run, example` and new-style kwargs)
74
+ - 222 tests with full mocking (no API calls required)
75
+ - Comprehensive README with quick-start examples and API reference
lefx-0.3.0/CLAUDE.md ADDED
@@ -0,0 +1,101 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## What is this project?
6
+
7
+ LEF (LangSmith Evaluation Framework) is a plug-and-play evaluation system for LangChain, LangGraph, and LangSmith projects. It wraps `langsmith`, `openevals`, and `agentevals` into a unified framework with built-in QA/CI support.
8
+
9
+ ## Common Commands
10
+
11
+ ```bash
12
+ # Install dependencies
13
+ uv sync --extra dev --extra all
14
+
15
+ # Run all tests
16
+ uv run pytest
17
+
18
+ # Run a single test file
19
+ uv run pytest tests/test_scorers.py
20
+
21
+ # Run a single test by name
22
+ uv run pytest tests/test_scorers.py -k "test_exact_match"
23
+
24
+ # Lint
25
+ uv run ruff check src/ tests/
26
+
27
+ # Lint with auto-fix
28
+ uv run ruff check --fix src/ tests/
29
+
30
+ # Type check
31
+ uv run mypy src/
32
+
33
+ # Run the CLI
34
+ uv run lef run <config.yaml>
35
+ ```
36
+
37
+ ## Architecture
38
+
39
+ Source lives in `src/lef/`. All public API is exported from `src/lef/__init__.py` via `__all__`.
40
+
41
+ ### Key design decisions
42
+
43
+ - **EvalResult is a `dict` subclass** (not a Pydantic model) — this is required because LangSmith's `evaluate()` expects dict-like results. It also provides property access (`.key`, `.score`, `.comment`, `.metadata`).
44
+ - **Dual-signature compatibility** — all evaluators accept both LangSmith's legacy `(run, example)` and new-style `(inputs=, outputs=, reference_outputs=)` keyword signatures. The `@scorer` decorator in `core/decorators.py` handles this normalization.
45
+ - **Decorators normalize return types** — `@scorer` accepts `bool`, `float`, `int`, `dict`, or `EvalResult` returns and always produces a LangSmith-compatible dict.
46
+ - **openevals judges are thin wrappers** — `judges/llm.py` factory functions (e.g., `correctness_judge()`) wrap `openevals.create_llm_as_judge()` with LEF defaults and return callables.
47
+ - **Local datasets don't need LangSmith** — `load_examples()` from `datasets/loader.py` supports YAML/JSON/CSV. Combined with `upload_results=False`, evaluation runs fully offline.
48
+
49
+ ### Module layout
50
+
51
+ **Core:**
52
+ - `core/types.py` — `EvalResult`, `EvalResultBatch`, `JudgeModel` enum, evaluator protocols
53
+ - `core/base.py` — `BaseEvaluator`, `AsyncBaseEvaluator` abstract classes
54
+ - `core/decorators.py` — `@scorer` and `@evaluator` decorators
55
+
56
+ **Scorers & Judges:**
57
+ - `scorers/builtin.py` — Rule-based scorers: `exact_match`, `contains`, `regex_match`, `json_match`, `mean_score`, `pass_rate`
58
+ - `scorers/custom.py` — `create_scorer`, `create_composite_scorer` factories
59
+ - `judges/llm.py` — 20+ LLM-as-judge factories (correctness, safety, hallucination, RAG, etc.)
60
+ - `judges/trajectory.py` — Agent trajectory evaluators wrapping `agentevals`
61
+
62
+ **Data & Runner:**
63
+ - `datasets/runner.py` — `run_eval`, `arun_eval`, `EvalRunner`, `create_dataset`
64
+ - `datasets/loader.py` — `load_examples` for local YAML/JSON/CSV files
65
+
66
+ **CLI:**
67
+ - `cli.py` — CLI entry point with 7 subcommands: `run`, `compare`, `baseline`, `qa`, `monitor`, `redteam`, `dataset`
68
+
69
+ **Integrations:**
70
+ - `integrations/langchain.py` — LangChain chain evaluation
71
+ - `integrations/langgraph.py` — LangGraph graph evaluation
72
+ - `integrations/remote.py` — HTTP endpoint targets (`create_remote_target`)
73
+
74
+ **v0.2.0 Modules:**
75
+ - `output.py` — Rich terminal tables + JSON/CSV/JUnit XML export
76
+ - `git_context.py` — Git-aware experiment metadata (branch, commit, CI detection)
77
+ - `compare.py` — Baseline management and experiment comparison
78
+ - `cache.py` — Content-addressable result caching with TTL
79
+ - `watch.py` — File-watching mode for iterative development
80
+ - `ci/github.py` — GitHub PR comment integration
81
+ - `ci/azuredevops.py` — Azure DevOps PR comment integration
82
+ - `pytest_plugin.py` — pytest plugin with `lef_eval` fixture (entry point: `pytest11`)
83
+ - `synthetic.py` — Synthetic dataset generation from documents/traces
84
+ - `monitor.py` — Production monitoring daemon with threshold alerting
85
+ - `redteam.py` — Adversarial red-team testing with 6 attack categories
86
+ - `online/tracing.py` — Online evaluation of production LangSmith runs
87
+
88
+ ## Testing
89
+
90
+ - All tests in `tests/`, using pytest with `asyncio_mode = "auto"` (no need for `@pytest.mark.asyncio`)
91
+ - Tests mock LLM APIs via `unittest.mock.patch` — no real API calls needed
92
+ - Shared fixtures in `tests/conftest.py`: `sample_inputs`, `sample_outputs`, `sample_reference_outputs`, `wrong_outputs`, `empty_outputs`
93
+
94
+ ## When Adding New Evaluators
95
+
96
+ 1. Add the factory function to `src/lef/judges/llm.py` (for LLM judges) or `src/lef/scorers/builtin.py` (for rule-based)
97
+ 2. Export from the sub-package `__init__.py`
98
+ 3. Export from the top-level `src/lef/__init__.py` and add to `__all__`
99
+ 4. Add to `_BUILTIN_EVALUATORS` in `src/lef/cli.py` for CLI support
100
+ 5. Add tests in `tests/`
101
+ 6. Update the README table
lefx-0.3.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 bogware
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
lefx-0.3.0/MANIFEST.in ADDED
@@ -0,0 +1,5 @@
1
+ include LICENSE
2
+ include README.md
3
+ include CHANGELOG.md
4
+ include pyproject.toml
5
+ recursive-include src/lef *.py