trusteval-ai 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. trusteval_ai-1.0.0/CHANGELOG.md +19 -0
  2. trusteval_ai-1.0.0/LICENSE +21 -0
  3. trusteval_ai-1.0.0/MANIFEST.in +6 -0
  4. trusteval_ai-1.0.0/PKG-INFO +572 -0
  5. trusteval_ai-1.0.0/README.md +487 -0
  6. trusteval_ai-1.0.0/assets/logo-dark.svg +32 -0
  7. trusteval_ai-1.0.0/assets/logo.svg +32 -0
  8. trusteval_ai-1.0.0/cli/__init__.py +3 -0
  9. trusteval_ai-1.0.0/cli/main.py +930 -0
  10. trusteval_ai-1.0.0/dashboard/__init__.py +3 -0
  11. trusteval_ai-1.0.0/dashboard/backend/__init__.py +3 -0
  12. trusteval_ai-1.0.0/dashboard/backend/main.py +196 -0
  13. trusteval_ai-1.0.0/dashboard/backend/middleware/__init__.py +3 -0
  14. trusteval_ai-1.0.0/dashboard/backend/middleware/auth.py +87 -0
  15. trusteval_ai-1.0.0/dashboard/backend/middleware/cors.py +63 -0
  16. trusteval_ai-1.0.0/dashboard/backend/middleware/logging.py +71 -0
  17. trusteval_ai-1.0.0/dashboard/backend/middleware/rate_limit.py +43 -0
  18. trusteval_ai-1.0.0/dashboard/backend/models/__init__.py +3 -0
  19. trusteval_ai-1.0.0/dashboard/backend/models/database.py +158 -0
  20. trusteval_ai-1.0.0/dashboard/backend/models/schemas.py +223 -0
  21. trusteval_ai-1.0.0/dashboard/backend/routers/__init__.py +3 -0
  22. trusteval_ai-1.0.0/dashboard/backend/routers/evaluations.py +188 -0
  23. trusteval_ai-1.0.0/dashboard/backend/routers/health.py +86 -0
  24. trusteval_ai-1.0.0/dashboard/backend/routers/industries.py +128 -0
  25. trusteval_ai-1.0.0/dashboard/backend/routers/providers.py +161 -0
  26. trusteval_ai-1.0.0/dashboard/backend/routers/reports.py +91 -0
  27. trusteval_ai-1.0.0/dashboard/backend/services/__init__.py +3 -0
  28. trusteval_ai-1.0.0/dashboard/backend/services/evaluation_service.py +273 -0
  29. trusteval_ai-1.0.0/dashboard/backend/services/report_service.py +252 -0
  30. trusteval_ai-1.0.0/pyproject.toml +101 -0
  31. trusteval_ai-1.0.0/setup.cfg +33 -0
  32. trusteval_ai-1.0.0/trusteval/__init__.py +39 -0
  33. trusteval_ai-1.0.0/trusteval/core/__init__.py +22 -0
  34. trusteval_ai-1.0.0/trusteval/core/benchmark.py +224 -0
  35. trusteval_ai-1.0.0/trusteval/core/evaluator.py +797 -0
  36. trusteval_ai-1.0.0/trusteval/core/pipeline.py +165 -0
  37. trusteval_ai-1.0.0/trusteval/core/result.py +285 -0
  38. trusteval_ai-1.0.0/trusteval/core/scorer.py +162 -0
  39. trusteval_ai-1.0.0/trusteval/industries/__init__.py +100 -0
  40. trusteval_ai-1.0.0/trusteval/industries/base_industry.py +186 -0
  41. trusteval_ai-1.0.0/trusteval/industries/bfsi/__init__.py +26 -0
  42. trusteval_ai-1.0.0/trusteval/industries/bfsi/benchmarks.py +230 -0
  43. trusteval_ai-1.0.0/trusteval/industries/bfsi/compliance.py +161 -0
  44. trusteval_ai-1.0.0/trusteval/industries/bfsi/datasets.py +1005 -0
  45. trusteval_ai-1.0.0/trusteval/industries/healthcare/__init__.py +26 -0
  46. trusteval_ai-1.0.0/trusteval/industries/healthcare/benchmarks.py +228 -0
  47. trusteval_ai-1.0.0/trusteval/industries/healthcare/compliance.py +157 -0
  48. trusteval_ai-1.0.0/trusteval/industries/healthcare/datasets.py +1059 -0
  49. trusteval_ai-1.0.0/trusteval/industries/legal/__init__.py +26 -0
  50. trusteval_ai-1.0.0/trusteval/industries/legal/benchmarks.py +236 -0
  51. trusteval_ai-1.0.0/trusteval/industries/legal/compliance.py +160 -0
  52. trusteval_ai-1.0.0/trusteval/industries/legal/datasets.py +225 -0
  53. trusteval_ai-1.0.0/trusteval/industries/retail/__init__.py +26 -0
  54. trusteval_ai-1.0.0/trusteval/industries/retail/benchmarks.py +230 -0
  55. trusteval_ai-1.0.0/trusteval/industries/retail/compliance.py +156 -0
  56. trusteval_ai-1.0.0/trusteval/industries/retail/datasets.py +355 -0
  57. trusteval_ai-1.0.0/trusteval/pillars/__init__.py +69 -0
  58. trusteval_ai-1.0.0/trusteval/pillars/bias/__init__.py +24 -0
  59. trusteval_ai-1.0.0/trusteval/pillars/bias/detector.py +268 -0
  60. trusteval_ai-1.0.0/trusteval/pillars/bias/metrics.py +276 -0
  61. trusteval_ai-1.0.0/trusteval/pillars/bias/test_cases.py +168 -0
  62. trusteval_ai-1.0.0/trusteval/pillars/hallucination/__init__.py +24 -0
  63. trusteval_ai-1.0.0/trusteval/pillars/hallucination/detector.py +248 -0
  64. trusteval_ai-1.0.0/trusteval/pillars/hallucination/metrics.py +240 -0
  65. trusteval_ai-1.0.0/trusteval/pillars/hallucination/test_cases.py +214 -0
  66. trusteval_ai-1.0.0/trusteval/pillars/pii/__init__.py +30 -0
  67. trusteval_ai-1.0.0/trusteval/pillars/pii/detector.py +203 -0
  68. trusteval_ai-1.0.0/trusteval/pillars/pii/patterns.py +207 -0
  69. trusteval_ai-1.0.0/trusteval/pillars/pii/test_cases.py +128 -0
  70. trusteval_ai-1.0.0/trusteval/pillars/toxicity/__init__.py +26 -0
  71. trusteval_ai-1.0.0/trusteval/pillars/toxicity/detector.py +192 -0
  72. trusteval_ai-1.0.0/trusteval/pillars/toxicity/metrics.py +324 -0
  73. trusteval_ai-1.0.0/trusteval/pillars/toxicity/test_cases.py +109 -0
  74. trusteval_ai-1.0.0/trusteval/providers/__init__.py +80 -0
  75. trusteval_ai-1.0.0/trusteval/providers/anthropic_provider.py +336 -0
  76. trusteval_ai-1.0.0/trusteval/providers/base.py +247 -0
  77. trusteval_ai-1.0.0/trusteval/providers/gemini_provider.py +358 -0
  78. trusteval_ai-1.0.0/trusteval/providers/huggingface_provider.py +512 -0
  79. trusteval_ai-1.0.0/trusteval/providers/openai_provider.py +351 -0
  80. trusteval_ai-1.0.0/trusteval/providers/provider_factory.py +261 -0
  81. trusteval_ai-1.0.0/trusteval/reporters/__init__.py +23 -0
  82. trusteval_ai-1.0.0/trusteval/reporters/base_reporter.py +140 -0
  83. trusteval_ai-1.0.0/trusteval/reporters/csv_reporter.py +164 -0
  84. trusteval_ai-1.0.0/trusteval/reporters/html_reporter.py +299 -0
  85. trusteval_ai-1.0.0/trusteval/reporters/json_reporter.py +73 -0
  86. trusteval_ai-1.0.0/trusteval/reporters/pdf_reporter.py +126 -0
  87. trusteval_ai-1.0.0/trusteval/security/__init__.py +21 -0
  88. trusteval_ai-1.0.0/trusteval/security/audit_logger.py +270 -0
  89. trusteval_ai-1.0.0/trusteval/security/encryption.py +204 -0
  90. trusteval_ai-1.0.0/trusteval/security/input_sanitizer.py +199 -0
  91. trusteval_ai-1.0.0/trusteval/security/key_manager.py +268 -0
  92. trusteval_ai-1.0.0/trusteval/security/rate_limiter.py +221 -0
  93. trusteval_ai-1.0.0/trusteval/utils/__init__.py +43 -0
  94. trusteval_ai-1.0.0/trusteval/utils/config.py +196 -0
  95. trusteval_ai-1.0.0/trusteval/utils/exceptions.py +136 -0
  96. trusteval_ai-1.0.0/trusteval/utils/logger.py +139 -0
  97. trusteval_ai-1.0.0/trusteval/utils/validators.py +198 -0
  98. trusteval_ai-1.0.0/trusteval/version.py +5 -0
  99. trusteval_ai-1.0.0/trusteval_ai.egg-info/PKG-INFO +572 -0
  100. trusteval_ai-1.0.0/trusteval_ai.egg-info/SOURCES.txt +103 -0
  101. trusteval_ai-1.0.0/trusteval_ai.egg-info/dependency_links.txt +1 -0
  102. trusteval_ai-1.0.0/trusteval_ai.egg-info/entry_points.txt +2 -0
  103. trusteval_ai-1.0.0/trusteval_ai.egg-info/requires.txt +36 -0
  104. trusteval_ai-1.0.0/trusteval_ai.egg-info/top_level.txt +3 -0
@@ -0,0 +1,19 @@
1
+ # Changelog
2
+
3
+ All notable changes to TrustEval will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [1.0.0] - 2025-03-18
8
+
9
+ ### Added
10
+ - Core evaluation engine with 4 Responsible AI pillars (Bias, Hallucination, PII, Toxicity)
11
+ - LLM provider connectors for OpenAI, Anthropic, Google Gemini, and HuggingFace
12
+ - Industry-specific benchmark modules for Healthcare, BFSI, Retail, and Legal
13
+ - CLI tool with rich terminal output (`trusteval evaluate`, `trusteval compare`, etc.)
14
+ - Web Dashboard with FastAPI backend and React + Tailwind frontend
15
+ - Report generation in PDF, JSON, CSV, and HTML formats
16
+ - Application security module (API key encryption, input sanitization, rate limiting, audit logging)
17
+ - Comprehensive unit and integration test suite
18
+ - CI/CD pipelines for testing, security scanning, and PyPI publishing
19
+ - Full documentation with quickstart guide, SDK reference, and industry guides
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Antrixsh Gupta
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ include LICENSE
2
+ include README.md
3
+ include CHANGELOG.md
4
+ include pyproject.toml
5
+ recursive-include trusteval *.py *.yaml *.json
6
+ recursive-include assets *.svg *.png
@@ -0,0 +1,572 @@
1
+ Metadata-Version: 2.4
2
+ Name: trusteval-ai
3
+ Version: 1.0.0
4
+ Summary: Enterprise LLM Evaluation & Responsible AI Framework for Healthcare, BFSI, Retail & Legal industries
5
+ Home-page: https://github.com/antrixsh/trusteval
6
+ Author: Antrixsh Gupta
7
+ Author-email: Antrixsh Gupta <antrixsh@example.com>
8
+ Maintainer-email: Antrixsh Gupta <antrixsh@example.com>
9
+ License: MIT License
10
+
11
+ Copyright (c) 2024 Antrixsh Gupta
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+
31
+ Project-URL: Homepage, https://github.com/antrixsh/trusteval
32
+ Project-URL: Documentation, https://github.com/antrixsh/trusteval/docs
33
+ Project-URL: Repository, https://github.com/antrixsh/trusteval
34
+ Project-URL: Bug Tracker, https://github.com/antrixsh/trusteval/issues
35
+ Project-URL: Author LinkedIn, https://www.linkedin.com/in/antrixshgupta
36
+ Keywords: llm,evaluation,responsible-ai,healthcare-ai,bfsi,bias-detection,hallucination,pii,toxicity,openai,anthropic,gemini,huggingface,enterprise-ai,ai-safety,llm-benchmark
37
+ Classifier: Development Status :: 5 - Production/Stable
38
+ Classifier: Intended Audience :: Developers
39
+ Classifier: Intended Audience :: Science/Research
40
+ Classifier: License :: OSI Approved :: MIT License
41
+ Classifier: Programming Language :: Python :: 3.10
42
+ Classifier: Programming Language :: Python :: 3.11
43
+ Classifier: Programming Language :: Python :: 3.12
44
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
45
+ Classifier: Topic :: Software Development :: Quality Assurance
46
+ Requires-Python: >=3.10
47
+ Description-Content-Type: text/markdown
48
+ License-File: LICENSE
49
+ Requires-Dist: openai>=1.0.0
50
+ Requires-Dist: anthropic>=0.20.0
51
+ Requires-Dist: google-generativeai>=0.4.0
52
+ Requires-Dist: transformers>=4.38.0
53
+ Requires-Dist: fastapi>=0.110.0
54
+ Requires-Dist: uvicorn[standard]>=0.27.0
55
+ Requires-Dist: sqlalchemy>=2.0.0
56
+ Requires-Dist: pydantic>=2.0.0
57
+ Requires-Dist: pydantic-settings>=2.0.0
58
+ Requires-Dist: click>=8.1.0
59
+ Requires-Dist: rich>=13.0.0
60
+ Requires-Dist: cryptography>=42.0.0
61
+ Requires-Dist: python-jose>=3.3.0
62
+ Requires-Dist: slowapi>=0.1.9
63
+ Requires-Dist: loguru>=0.7.0
64
+ Requires-Dist: weasyprint>=61.0
65
+ Requires-Dist: pandas>=2.0.0
66
+ Requires-Dist: numpy>=1.24.0
67
+ Requires-Dist: scikit-learn>=1.3.0
68
+ Requires-Dist: httpx>=0.27.0
69
+ Requires-Dist: tenacity>=8.2.0
70
+ Requires-Dist: python-dotenv>=1.0.0
71
+ Requires-Dist: aiosqlite>=0.20.0
72
+ Requires-Dist: websockets>=12.0
73
+ Requires-Dist: pyyaml>=6.0
74
+ Provides-Extra: dev
75
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
76
+ Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
77
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
78
+ Requires-Dist: pytest-mock>=3.12.0; extra == "dev"
79
+ Requires-Dist: ruff>=0.3.0; extra == "dev"
80
+ Requires-Dist: mypy>=1.9.0; extra == "dev"
81
+ Requires-Dist: bandit>=1.7.0; extra == "dev"
82
+ Requires-Dist: safety>=3.0.0; extra == "dev"
83
+ Requires-Dist: responses>=0.25.0; extra == "dev"
84
+ Dynamic: license-file
85
+
86
+ <p align="center">
87
+ <img src="assets/logo.svg" alt="TrustEval — Enterprise LLM Evaluation Framework" width="140" height="140">
88
+ </p>
89
+
90
+ <h1 align="center">TrustEval</h1>
91
+ <p align="center"><strong>Benchmark LLMs. Build Trust. Ship Responsibly.</strong></p>
92
+ <p align="center">The open-source framework for evaluating LLM safety, fairness, and reliability in regulated industries.</p>
93
+
94
+ <p align="center">
95
+ <a href="https://pypi.org/project/trusteval-ai/"><img src="https://img.shields.io/pypi/v/trusteval-ai?color=6366F1&style=for-the-badge&logo=pypi&logoColor=white" alt="PyPI Version"></a>
96
+ <a href="https://pypi.org/project/trusteval-ai/"><img src="https://img.shields.io/pypi/pyversions/trusteval-ai?color=3776AB&style=for-the-badge&logo=python&logoColor=white" alt="Python 3.10 | 3.11 | 3.12"></a>
97
+ <a href="https://github.com/antrixsh/trusteval/blob/main/LICENSE"><img src="https://img.shields.io/badge/license-MIT-10B981?style=for-the-badge" alt="MIT License"></a>
98
+ <a href="https://github.com/antrixsh/trusteval/actions/workflows/ci.yml"><img src="https://img.shields.io/github/actions/workflow/status/antrixsh/trusteval/ci.yml?style=for-the-badge&logo=githubactions&logoColor=white&label=CI" alt="CI Status"></a>
99
+ </p>
100
+
101
+ <p align="center">
102
+ <a href="https://github.com/antrixsh/trusteval/stargazers"><img src="https://img.shields.io/github/stars/antrixsh/trusteval?style=for-the-badge&logo=github&color=yellow" alt="GitHub Stars"></a>
103
+ <a href="https://github.com/antrixsh/trusteval/network/members"><img src="https://img.shields.io/github/forks/antrixsh/trusteval?style=for-the-badge&logo=github&color=blue" alt="GitHub Forks"></a>
104
+ <a href="https://github.com/antrixsh/trusteval/issues"><img src="https://img.shields.io/github/issues/antrixsh/trusteval?style=for-the-badge&logo=github&color=orange" alt="Open Issues"></a>
105
+ <a href="https://pypi.org/project/trusteval-ai/"><img src="https://img.shields.io/pypi/dm/trusteval-ai?color=6366F1&style=for-the-badge&logo=pypi&logoColor=white&label=downloads" alt="PyPI Downloads"></a>
106
+ </p>
107
+
108
+ <p align="center">
109
+ <a href="#-quick-start">Quick Start</a> •
110
+ <a href="#-features">Features</a> •
111
+ <a href="#-supported-industries">Industries</a> •
112
+ <a href="#-evaluation-pillars">Pillars</a> •
113
+ <a href="#-providers">Providers</a> •
114
+ <a href="#-documentation">Docs</a> •
115
+ <a href="#-contributing">Contributing</a>
116
+ </p>
117
+
118
+ ---
119
+
120
+ ## Why TrustEval?
121
+
122
+ Deploying LLMs in regulated industries like **Healthcare**, **Banking**, **Retail**, and **Legal** is risky without proper evaluation. Off-the-shelf benchmarks don't cover domain-specific compliance, bias, or safety requirements.
123
+
124
+ **TrustEval** is a production-ready Python framework that provides:
125
+
126
+ - **Industry-specific benchmarks** — 600+ test prompts aligned to real regulations (HIPAA, GDPR, PCI-DSS, ABA Rules)
127
+ - **4 Responsible AI pillars** — Bias & Fairness, Hallucination Detection, PII/Data Leakage, Toxicity & Safety
128
+ - **Multi-provider support** — Evaluate OpenAI, Anthropic, Google Gemini, and HuggingFace models side-by-side
129
+ - **Enterprise-grade security** — Encrypted API key storage, audit logging, input sanitization, rate limiting
130
+ - **3 interfaces** — Python SDK, CLI tool, and Web Dashboard
131
+ - **Compliance-ready reports** — PDF, JSON, CSV, and HTML — built for audit teams
132
+
133
+ > *"Don't just deploy AI. Trust it."*
134
+
135
+ ---
136
+
137
+ ## ✨ Features
138
+
139
+ <table>
140
+ <tr>
141
+ <td width="50%">
142
+
143
+ ### πŸ›‘οΈ 4 AI Safety Pillars
144
+ Evaluate hallucination, bias, PII leakage, and toxicity with weighted scoring and automated grading (A–F).
145
+
146
+ ### πŸ₯ 4 Industry Modules
147
+ Healthcare (HIPAA), BFSI (GDPR/PCI-DSS), Retail (FTC), Legal (ABA) — each with 150+ domain-specific prompts.
148
+
149
+ ### 🔗 4 LLM Providers
150
+ OpenAI GPT-4, Anthropic Claude, Google Gemini, HuggingFace — test any model with one API.
151
+
152
+ </td>
153
+ <td width="50%">
154
+
155
+ ### 📊 Web Dashboard
156
+ Real-time evaluation results, model comparison, and trend analysis with React + Tailwind + Recharts.
157
+
158
+ ### 📋 Compliance Reports
159
+ Generate audit-ready PDF, JSON, CSV, and HTML reports with per-pillar breakdowns and regulatory citations.
160
+
161
+ ### πŸ” Enterprise Security
162
+ Fernet-encrypted key storage, SHA256 hash-chain audit logs, prompt injection detection, token bucket rate limiting.
163
+
164
+ </td>
165
+ </tr>
166
+ </table>
167
+
168
+ ---
169
+
170
+ ## 🚀 Quick Start
171
+
172
+ ### Installation
173
+
174
+ ```bash
175
+ pip install trusteval
176
+ ```
177
+
178
+ ### Python SDK
179
+
180
+ ```python
181
+ from trusteval import TrustEvaluator
182
+
183
+ evaluator = TrustEvaluator(
184
+ provider="openai",
185
+ model="gpt-4o",
186
+ industry="healthcare"
187
+ )
188
+
189
+ result = evaluator.evaluate()
190
+ print(result.summary())
191
+
192
+ # Export compliance report
193
+ result.export("audit_report.pdf")
194
+ result.export("audit_data.json", format="json")
195
+ ```
196
+
197
+ ### CLI
198
+
199
+ ```bash
200
+ # Run a full evaluation
201
+ trusteval evaluate --provider openai --model gpt-4o --industry healthcare -o results.json
202
+
203
+ # Compare two models
204
+ trusteval compare --providers openai,anthropic --models gpt-4o,claude-3-opus-20240229
205
+
206
+ # Generate a report
207
+ trusteval report generate -i results.json -f html -o report.html
208
+ ```
209
+
210
+ ### Web Dashboard
211
+
212
+ ```bash
213
+ # Start the dashboard server
214
+ trusteval dashboard start
215
+
216
+ # Open http://localhost:8080 in your browser
217
+ ```
218
+
219
+ ---
220
+
221
+ ## 🏭 Supported Industries
222
+
223
+ | Industry | Benchmark Areas | Regulations | Prompts |
224
+ |----------|----------------|-------------|---------|
225
+ | **πŸ₯ Healthcare** | Clinical QA, Triage, ICD Coding, PHI Leakage, Drug Interactions | HIPAA, FDA, Clinical Guidelines | 155+ |
226
+ | **🏦 BFSI** | Credit Fairness, Fraud Detection, KYC/AML, Risk Assessment | GDPR, PCI-DSS, SOX, Basel III | 156+ |
227
+ | **πŸ›’ Retail** | Recommendations, Customer Service, Pricing, Consumer PII | FTC Act, CCPA, Consumer Protection | 156+ |
228
+ | **βš–οΈ Legal** | Contract Analysis, Legal Advice, Privilege, Jurisdictional Awareness | ABA Model Rules, UPL Statutes | 156+ |
229
+
230
+ Each industry module includes:
231
+ - Domain-specific test prompts mapped to trust pillars
232
+ - Regulatory compliance checks with pass/fail results
233
+ - Industry-specific scoring and grading criteria
234
+
235
+ ---
236
+
237
+ ## πŸ“ Evaluation Pillars
238
+
239
+ TrustEval evaluates every LLM response across four Responsible AI dimensions:
240
+
241
+ | Pillar | Weight | What It Measures | Key Metrics |
242
+ |--------|--------|------------------|-------------|
243
+ | **πŸ” Hallucination** | 30% | Factual accuracy and reliability | F1 word-overlap, source grounding, confidence calibration, consistency |
244
+ | **βš–οΈ Bias & Fairness** | 25% | Equitable treatment across demographics | Demographic parity, counterfactual consistency, stereotype density |
245
+ | **πŸ”’ PII Detection** | 25% | Data leakage and privacy protection | 20 PII pattern types, Luhn validation, PII echo detection |
246
+ | **πŸ›‘οΈ Toxicity** | 20% | Harmful and unsafe content | Hate speech, profanity, violence scoring, jailbreak resistance |
247
+
248
+ ### Scoring & Grading
249
+
250
+ | Grade | Score Range | Trust Level | Meaning |
251
+ |-------|-----------|-------------|---------|
252
+ | **A** | 0.85 – 1.00 | ✅ TRUSTED | Safe for production deployment |
253
+ | **B** | 0.70 – 0.84 | ✅ TRUSTED | Safe with monitoring |
254
+ | **C** | 0.55 – 0.69 | ⚠️ CONDITIONAL | Requires human oversight |
255
+ | **D** | 0.40 – 0.54 | ⚠️ CONDITIONAL | Significant concerns |
256
+ | **F** | 0.00 – 0.39 | ❌ UNTRUSTED | Not recommended for deployment |
257
+
258
+ ---
259
+
260
+ ## 🔗 Providers
261
+
262
+ | Provider | Models | Features |
263
+ |----------|--------|----------|
264
+ | **OpenAI** | GPT-4, GPT-4 Turbo, GPT-4o, GPT-3.5 Turbo | Sync & async, token counting, cost estimation |
265
+ | **Anthropic** | Claude 3 Opus, Sonnet, Haiku, Claude 2.1 | Message format handling, system prompts |
266
+ | **Google Gemini** | Gemini Pro, Gemini 1.5 Pro, Gemini 1.5 Flash | Content generation, safety settings |
267
+ | **HuggingFace** | Any model via Inference API or local | Auto-detect local vs. Hub, pipeline support |
268
+
269
+ ### Provider Configuration
270
+
271
+ ```bash
272
+ # Set API keys via environment variables
273
+ export OPENAI_API_KEY="sk-..."
274
+ export ANTHROPIC_API_KEY="sk-ant-..."
275
+ export GOOGLE_API_KEY="..."
276
+ export HUGGINGFACE_API_KEY="hf_..."
277
+
278
+ # Or use TrustEval's encrypted key manager
279
+ trusteval providers configure --provider openai
280
+
281
+ # Test connectivity
282
+ trusteval providers test --provider openai
283
+
284
+ # List all supported providers and models
285
+ trusteval providers list
286
+ ```
287
+
288
+ ---
289
+
290
+ ## πŸ—οΈ Architecture
291
+
292
+ ```
293
+ trusteval/
294
+ ├── core/ # Evaluation engine, scoring, pipeline orchestration
295
+ │ ├── evaluator.py # Main TrustEvaluator class
296
+ │ ├── scorer.py # Weighted scoring, grading (A-F), trust levels
297
+ │ ├── pipeline.py # Sequential & parallel evaluation pipelines
298
+ │ ├── result.py # EvaluationResult with export capabilities
299
+ │ └── benchmark.py # BenchmarkSuite ABC with TestCase/TestResult
300
+ ├── pillars/ # Responsible AI detection modules
301
+ │ ├── bias/ # BiasDetector, stereotype matching, demographic parity
302
+ │ ├── hallucination/ # Factual accuracy (F1), confidence calibration
303
+ │ ├── pii/ # 20 PII regex patterns, Luhn validation
304
+ │ └── toxicity/ # Hate speech, violence, profanity, jailbreak detection
305
+ ├── providers/ # LLM provider connectors with retry logic
306
+ │ ├── openai_provider.py
307
+ │ ├── anthropic_provider.py
308
+ │ ├── gemini_provider.py
309
+ │ └── huggingface_provider.py
310
+ ├── industries/ # Domain-specific benchmark suites
311
+ │ ├── healthcare/ # HIPAA compliance, PHI detection, clinical QA
312
+ │ ├── bfsi/ # GDPR, PCI-DSS, credit fairness, fraud detection
313
+ │ ├── retail/ # FTC compliance, consumer PII, pricing fairness
314
+ │ └── legal/ # ABA rules, privilege detection, jurisdictional awareness
315
+ ├── security/ # Enterprise security module
316
+ │ ├── encryption.py # PBKDF2 + Fernet symmetric encryption
317
+ │ ├── key_manager.py # Encrypted API key storage (~/.trusteval/keys.enc)
318
+ │ ├── audit_logger.py # SHA256 hash-chain tamper-evident logging
319
+ │ ├── input_sanitizer.py # 23 injection patterns, prompt length limits
320
+ │ └── rate_limiter.py # Token bucket algorithm (60 RPM default)
321
+ ├── reporters/ # Report generation (PDF, JSON, CSV, HTML)
322
+ └── utils/ # Validators, helpers, constants
323
+
324
+ cli/ # Click + Rich CLI tool
325
+ dashboard/
326
+ ├── backend/ # FastAPI + async SQLAlchemy + WebSocket
327
+ └── frontend/ # React 18 + Vite + Tailwind CSS + Recharts
328
+
329
+ tests/
330
+ ├── unit/ # 157 unit tests
331
+ └── integration/ # 34 integration tests
332
+ ```
333
+
334
+ ---
335
+
336
+ ## 📊 Full Example — Healthcare Evaluation
337
+
338
+ ```python
339
+ from trusteval import TrustEvaluator
340
+
341
+ # Configure evaluator for healthcare
342
+ evaluator = TrustEvaluator(
343
+ provider="openai",
344
+ model="gpt-4o",
345
+ industry="healthcare",
346
+ pillars=["bias", "hallucination", "pii", "toxicity"],
347
+ verbose=True
348
+ )
349
+
350
+ # Run full evaluation
351
+ result = evaluator.evaluate()
352
+
353
+ # Check results
354
+ print(f"Overall Score: {result.overall_score:.2f}")
355
+ print(f"Overall Grade: {result.overall_grade}")
356
+ print(f"Trust Level: {result.trust_level}")
357
+
358
+ # Per-pillar breakdown
359
+ for pillar_name, pillar in result.pillars.items():
360
+ print(f" {pillar_name}: {pillar.score:.2f} ({pillar.grade})"
361
+ f" - {pillar.pass_count}/{pillar.test_count} passed")
362
+
363
+ # Export compliance report
364
+ result.export("healthcare_gpt4o_audit.pdf")
365
+ result.export("healthcare_gpt4o_data.json", format="json")
366
+ result.export("healthcare_gpt4o_report.html", format="html")
367
+ ```
368
+
369
+ ### Compare Models Side-by-Side
370
+
371
+ ```python
372
+ evaluator_gpt = TrustEvaluator(provider="openai", model="gpt-4o", industry="healthcare")
373
+ evaluator_claude = TrustEvaluator(provider="anthropic", model="claude-3-opus-20240229", industry="healthcare")
374
+
375
+ comparison = evaluator_gpt.compare(evaluator_claude)
376
+ print(f"Winner: {comparison['winner']}")
377
+ print(f"GPT-4o Score: {comparison['results'][0]['overall_score']:.2f}")
378
+ print(f"Claude Score: {comparison['results'][1]['overall_score']:.2f}")
379
+ ```
380
+
381
+ ---
382
+
383
+ ## πŸ” Security
384
+
385
+ TrustEval is built with enterprise security requirements in mind:
386
+
387
+ | Feature | Implementation |
388
+ |---------|---------------|
389
+ | **API Key Encryption** | Fernet symmetric encryption with PBKDF2-HMAC-SHA256 key derivation |
390
+ | **Audit Logging** | SHA256 hash-chain with daily rotation (30-day retention) |
391
+ | **Input Sanitization** | 23 compiled injection patterns, 8000-char prompt limit |
392
+ | **Rate Limiting** | Token bucket algorithm, configurable RPM (default: 60) |
393
+ | **Prompt Injection Detection** | Pattern matching for DAN mode, jailbreaks, instruction overrides |
394
+ | **CORS Protection** | Configurable allowed origins for dashboard API |
395
+
396
+ ```python
397
+ from trusteval.security import KeyManager, InputSanitizer, AuditLogger
398
+
399
+ # Secure key storage
400
+ km = KeyManager()
401
+ km.store_key("openai", "sk-...")
402
+ key = km.get_key("openai")
403
+
404
+ # Input validation
405
+ sanitizer = InputSanitizer()
406
+ is_safe, cleaned = sanitizer.validate_prompt(user_input)
407
+
408
+ # Tamper-evident audit trail
409
+ logger = AuditLogger()
410
+ logger.log("evaluation_started", {"model": "gpt-4o", "industry": "healthcare"})
411
+ ```
412
+
413
+ ---
414
+
415
+ ## 🧪 Testing
416
+
417
+ TrustEval ships with **191 tests** covering all modules:
418
+
419
+ ```bash
420
+ # Run all tests
421
+ pytest tests/ -v
422
+
423
+ # Unit tests only
424
+ pytest tests/unit/ -v
425
+
426
+ # Integration tests only
427
+ pytest tests/integration/ -v
428
+
429
+ # With coverage
430
+ pytest tests/ --cov=trusteval --cov-report=html -v
431
+ ```
432
+
433
+ | Test Suite | Tests | Coverage |
434
+ |------------|-------|----------|
435
+ | Bias Detector | 22 | Stereotypes, counterfactual, demographic parity, gendered language |
436
+ | Hallucination Detector | 20 | Factual accuracy, hallucination rate, confidence, consistency |
437
+ | PII Detector | 23 | SSN, credit card, email, phone, IBAN, medical ID, IP address |
438
+ | Toxicity Detector | 20 | Hate speech, profanity, violence, jailbreak, category scoring |
439
+ | Evaluator | 12 | Init, pillar evaluation, comparison, error handling |
440
+ | Scorer | 22 | Grading, trust levels, weighted averages, edge cases |
441
+ | Security | 38 | Encryption, key management, sanitization, audit, rate limiting |
442
+ | OpenAI Provider | 9 | Generate, batch, rate limits, validation, cost estimation |
443
+ | Healthcare Benchmark | 17 | Prompts, compliance checks, coverage |
444
+ | Full Pipeline | 8 | End-to-end evaluation, export, comparison |
445
+
446
+ ---
447
+
448
+ ## βš™οΈ Configuration
449
+
450
+ ### Environment Variables
451
+
452
+ ```bash
453
+ # LLM Provider API Keys
454
+ export OPENAI_API_KEY="sk-..."
455
+ export ANTHROPIC_API_KEY="sk-ant-..."
456
+ export GOOGLE_API_KEY="..."
457
+ export HUGGINGFACE_API_KEY="hf_..."
458
+
459
+ # Dashboard
460
+ export TRUSTEVAL_DASHBOARD_KEY="your-secret-key"
461
+ export TRUSTEVAL_ALLOWED_ORIGINS="http://localhost:5173"
462
+ ```
463
+
464
+ ### Config File (~/.trusteval/config.yaml)
465
+
466
+ ```yaml
467
+ version: "1.0"
468
+ default_industry: healthcare
469
+ default_pillars:
470
+ - bias
471
+ - hallucination
472
+ - pii
473
+ - toxicity
474
+ evaluation:
475
+ timeout_seconds: 30
476
+ max_test_count: 100
477
+ ```
478
+
479
+ ---
480
+
481
+ ## 📖 Documentation
482
+
483
+ | Document | Description |
484
+ |----------|-------------|
485
+ | [Quick Start Guide](docs/quickstart.md) | Get up and running in 5 minutes |
486
+ | [SDK Reference](docs/sdk-reference.md) | Complete Python API documentation |
487
+ | [CLI Reference](docs/cli-reference.md) | All CLI commands and options |
488
+ | [Security Guide](docs/security.md) | Security architecture and best practices |
489
+ | [Industry Guides](docs/industries/) | Per-industry benchmark documentation |
490
+ | [Pillar Guides](docs/pillars/) | Deep-dive into each evaluation pillar |
491
+ | [Contributing](CONTRIBUTING.md) | How to contribute to TrustEval |
492
+ | [Changelog](CHANGELOG.md) | Version history and release notes |
493
+
494
+ ---
495
+
496
+ ## πŸ—ΊοΈ Roadmap
497
+
498
+ - [ ] **v1.1** — ML-based toxicity and bias detection (transformer models)
499
+ - [ ] **v1.2** — Additional industries (Manufacturing, Education, Government)
500
+ - [ ] **v1.3** — LLM-as-judge evaluation mode
501
+ - [ ] **v1.4** — Continuous monitoring and alerting
502
+ - [ ] **v2.0** — Multi-language support, EU AI Act compliance module
503
+
504
+ ---
505
+
506
+ ## 🤝 Contributing
507
+
508
+ We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
509
+
510
+ ```bash
511
+ # Clone and setup
512
+ git clone https://github.com/antrixsh/trusteval.git
513
+ cd trusteval
514
+ python -m venv .venv
515
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
516
+ pip install -e ".[dev]"
517
+
518
+ # Run tests
519
+ pytest tests/ -v
520
+
521
+ # Lint
522
+ ruff check trusteval/
523
+ ```
524
+
525
+ ---
526
+
527
+ ## 👤 Author
528
+
529
+ <table>
530
+ <tr>
531
+ <td>
532
+
533
+ **Antrixsh Gupta**
534
+
535
+ Enterprise AI & Data Science Leader | LinkedIn Top Voice in AI & Data Science
536
+
537
+ Senior Manager, Data & AI Practice @ Genzeon
538
+
539
+ - [LinkedIn](https://www.linkedin.com/in/antrixshgupta)
540
+ - [GitHub](https://github.com/antrixsh)
541
+
542
+ </td>
543
+ </tr>
544
+ </table>
545
+
546
+ > TrustEval was built to solve a real problem in enterprise AI: there was no single, industry-specific framework to evaluate whether an LLM is truly safe and reliable for regulated industries like Healthcare, BFSI, Retail, and Legal.
547
+
548
+ ---
549
+
550
+ ## 📄 License
551
+
552
+ MIT License — see [LICENSE](LICENSE) for details.
553
+
554
+ ---
555
+
556
+ ## ⭐ Star History
557
+
558
+ If TrustEval helps your team deploy LLMs responsibly, please consider giving it a star!
559
+
560
+ <p align="center">
561
+ <a href="https://github.com/antrixsh/trusteval/stargazers">
562
+ <img src="https://img.shields.io/github/stars/antrixsh/trusteval?style=social" alt="GitHub Stars">
563
+ </a>
564
+ </p>
565
+
566
+ <p align="center"><em>"Don't just deploy AI. Trust it."</em></p>
567
+
568
+ ---
569
+
570
+ <p align="center">
571
+ <strong>Keywords:</strong> LLM evaluation framework, responsible AI, AI safety, bias detection, hallucination detection, PII detection, toxicity detection, healthcare AI, BFSI AI, legal AI compliance, HIPAA AI evaluation, GDPR AI compliance, enterprise LLM benchmarking, AI fairness, LLM auditing, OpenAI evaluation, Claude evaluation, Gemini evaluation, HuggingFace evaluation, AI trust scoring, responsible AI framework, LLM safety testing, AI bias testing, AI compliance automation
572
+ </p>