frootai 3.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- frootai-3.3.0/PKG-INFO +204 -0
- frootai-3.3.0/README.md +182 -0
- frootai-3.3.0/frootai/__init__.py +27 -0
- frootai-3.3.0/frootai/ab_testing.py +162 -0
- frootai-3.3.0/frootai/cli.py +139 -0
- frootai-3.3.0/frootai/client.py +214 -0
- frootai-3.3.0/frootai/evaluation.py +69 -0
- frootai-3.3.0/frootai/knowledge.json +1 -0
- frootai-3.3.0/frootai/plays.py +74 -0
- frootai-3.3.0/frootai.egg-info/PKG-INFO +204 -0
- frootai-3.3.0/frootai.egg-info/SOURCES.txt +14 -0
- frootai-3.3.0/frootai.egg-info/dependency_links.txt +1 -0
- frootai-3.3.0/frootai.egg-info/entry_points.txt +2 -0
- frootai-3.3.0/frootai.egg-info/top_level.txt +1 -0
- frootai-3.3.0/pyproject.toml +40 -0
- frootai-3.3.0/setup.cfg +4 -0
frootai-3.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: frootai
|
|
3
|
+
Version: 3.3.0
|
|
4
|
+
Summary: FrootAI SDK — The open glue for AI architecture. Offline access to 16 knowledge modules, 20 solution plays, cost estimation, evaluation, and A/B testing.
|
|
5
|
+
Author-email: Pavleen Bali <pavleenbali@frootai.dev>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://frootai.dev
|
|
8
|
+
Project-URL: Repository, https://github.com/gitpavleenbali/frootai
|
|
9
|
+
Project-URL: Documentation, https://frootai.dev/api-docs
|
|
10
|
+
Keywords: frootai,ai,architecture,azure,mcp,agents,rag,sdk
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# FrootAI — Python SDK
|
|
24
|
+
|
|
25
|
+
> Offline-first access to 16 AI architecture knowledge modules, 20 solution plays, cost estimation, evaluation, and A/B testing. Zero external dependencies.
|
|
26
|
+
|
|
27
|
+
[PyPI package](https://pypi.org/project/frootai/)
|
|
28
|
+
[Supported Python versions](https://pypi.org/project/frootai/)
|
|
29
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install frootai
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from frootai import FrootAI, SolutionPlay, Evaluator
|
|
41
|
+
|
|
42
|
+
# Search 643KB knowledge base (16 modules across 5 FROOT layers)
|
|
43
|
+
client = FrootAI()
|
|
44
|
+
results = client.search("RAG architecture")
|
|
45
|
+
for r in results:
|
|
46
|
+
print(f"[{r['module_id']}] {r['title']} — {r['relevance']} hits")
|
|
47
|
+
|
|
48
|
+
# Get a specific module
|
|
49
|
+
module = client.get_module("R2") # RAG Architecture & Retrieval
|
|
50
|
+
print(f"{module['title']}: {module['content_length']:,} chars")
|
|
51
|
+
|
|
52
|
+
# List all FROOT layers
|
|
53
|
+
for layer in client.list_layers():
|
|
54
|
+
print(f"{layer['emoji']} {layer['name']} ({len(layer['modules'])} modules)")
|
|
55
|
+
|
|
56
|
+
# Estimate Azure costs
|
|
57
|
+
cost = client.estimate_cost("01-enterprise-rag", scale="prod")
|
|
58
|
+
print(f"${cost['monthly_total']}/mo")
|
|
59
|
+
|
|
60
|
+
# Browse 20 solution plays
|
|
61
|
+
plays = SolutionPlay.all()
|
|
62
|
+
ready = SolutionPlay.ready() # 3 production-ready
|
|
63
|
+
by_layer = SolutionPlay.by_layer("R") # Reasoning layer plays
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Features
|
|
67
|
+
|
|
68
|
+
### Knowledge Search (offline, no API calls)
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
client = FrootAI()
|
|
72
|
+
|
|
73
|
+
# Full-text search across 16 modules (643KB of real content)
|
|
74
|
+
results = client.search("embeddings", max_results=5)
|
|
75
|
+
|
|
76
|
+
# Get module by ID
|
|
77
|
+
mod = client.get_module("O2") # AI Agents & Microsoft Agent Framework
|
|
78
|
+
|
|
79
|
+
# List all modules
|
|
80
|
+
for m in client.list_modules():
|
|
81
|
+
print(f"{m['emoji']} {m['id']} {m['title']} ({m['content_length'] // 1024}KB)")
|
|
82
|
+
|
|
83
|
+
# Extract a specific section
|
|
84
|
+
section = client.get_module_section("F1", "Table of Contents")
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Glossary (159+ terms extracted from content)
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
# Look up a term
|
|
91
|
+
term = client.lookup_term("temperature")
|
|
92
|
+
|
|
93
|
+
# Search glossary
|
|
94
|
+
terms = client.search_glossary("embedding", max_results=10)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Cost Estimation
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
# Estimate monthly Azure costs for a solution play
|
|
101
|
+
cost = client.estimate_cost("01-enterprise-rag", scale="dev")
|
|
102
|
+
# {'play': '01-enterprise-rag', 'scale': 'dev', 'monthly_total': 430, 'breakdown': {...}}
|
|
103
|
+
|
|
104
|
+
cost = client.estimate_cost("01-enterprise-rag", scale="prod")
|
|
105
|
+
# {'monthly_total': 3600, 'breakdown': {'openai-gpt4o': 2500, 'ai-search-standard': 750, ...}}
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Solution Plays (20 pre-tuned architecture blueprints)
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
from frootai.plays import SolutionPlay
|
|
112
|
+
|
|
113
|
+
play = SolutionPlay.get("03")
|
|
114
|
+
print(f"{play.name}: {play.description}")
|
|
115
|
+
print(f"Infrastructure: {play.infra}")
|
|
116
|
+
print(f"Tuning params: {play.tuning}")
|
|
117
|
+
print(f"Related modules: {play.modules}")
|
|
118
|
+
|
|
119
|
+
# Filter by FROOT layer
|
|
120
|
+
orchestration_plays = SolutionPlay.by_layer("O_ORCH")
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Evaluation (quality gates)
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from frootai import Evaluator
|
|
127
|
+
|
|
128
|
+
evaluator = Evaluator()
|
|
129
|
+
scores = {"groundedness": 4.5, "relevance": 3.2, "coherence": 4.1, "fluency": 4.8}
|
|
130
|
+
|
|
131
|
+
results = evaluator.check_thresholds(scores)
|
|
132
|
+
print(evaluator.summary(scores))
|
|
133
|
+
# 3/4 checks passed (relevance 3.2 < threshold 4.0)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### A/B Testing (prompt experiments)
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
from frootai.ab_testing import PromptExperiment, PromptVariant
|
|
140
|
+
|
|
141
|
+
# You provide the model function — no fake scores
|
|
142
|
+
def my_model(system_prompt, query):
|
|
143
|
+
return call_your_llm(system_prompt=system_prompt, user_message=query)
|
|
144
|
+
|
|
145
|
+
def my_scorer(query, response):
|
|
146
|
+
return {"groundedness": 4.5, "relevance": 4.0}
|
|
147
|
+
|
|
148
|
+
experiment = PromptExperiment(
|
|
149
|
+
name="system-prompt-v2",
|
|
150
|
+
variants=[
|
|
151
|
+
PromptVariant("control", "You are a helpful assistant."),
|
|
152
|
+
PromptVariant("expert", "You are an Azure AI expert. Cite sources."),
|
|
153
|
+
],
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
results = experiment.run(["What is RAG?"], model_fn=my_model, scorer_fn=my_scorer)
|
|
157
|
+
print(f"Winner: {experiment.pick_winner(results)}")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## CLI
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
frootai plays # List all 20 solution plays
|
|
164
|
+
frootai plays --ready # Show production-ready plays only
|
|
165
|
+
frootai plays --layer R # Filter by FROOT layer
|
|
166
|
+
frootai search "embeddings" # Search knowledge base
|
|
167
|
+
frootai modules # List all 16 modules with sizes
|
|
168
|
+
frootai glossary temperature # Look up a term
|
|
169
|
+
frootai cost 01-enterprise-rag # Estimate Azure costs
|
|
170
|
+
frootai cost 01-enterprise-rag --scale prod
|
|
171
|
+
frootai --version # Show version
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## What's Inside
|
|
175
|
+
|
|
176
|
+
- **16 knowledge modules** (643KB) across 5 FROOT layers: Foundations, Reasoning, Orchestration, Operations, Transformation
|
|
177
|
+
- **20 solution plays** with infrastructure, tuning parameters, and module mapping
|
|
178
|
+
- **159+ glossary terms** extracted from module content
|
|
179
|
+
- **Cost estimation** for 10 plays with dev/prod breakdowns
|
|
180
|
+
- **Evaluation framework** with configurable thresholds
|
|
181
|
+
- **A/B testing framework** with real model callbacks (no fake scores)
|
|
182
|
+
- **Zero external dependencies** — pure Python stdlib
|
|
183
|
+
|
|
184
|
+
## FROOT Layers
|
|
185
|
+
|
|
186
|
+
| Layer | Emoji | Name | Modules |
|
|
187
|
+
|-------|-------|------|---------|
|
|
188
|
+
| F | 🌱 | Foundations | F1-F4 (GenAI, LLMs, Glossary, Agentic OS) |
|
|
189
|
+
| R | 🪵 | Reasoning | R1-R3 (Prompts, RAG, Deterministic AI) |
|
|
190
|
+
| O_ORCH | 🌿 | Orchestration | O1-O3 (Semantic Kernel, Agents, MCP) |
|
|
191
|
+
| O_OPS | 🏗️ | Operations | O4-O6 (Platform, Infrastructure, Copilot) |
|
|
192
|
+
| T | 🍎 | Transformation | T1-T3 (Fine-Tuning, Responsible AI, Production) |
|
|
193
|
+
|
|
194
|
+
## Links
|
|
195
|
+
|
|
196
|
+
- **Website:** [frootai.dev](https://frootai.dev)
|
|
197
|
+
- **npm MCP Server:** [frootai-mcp](https://www.npmjs.com/package/frootai-mcp)
|
|
198
|
+
- **VS Code Extension:** [pavleenbali.frootai](https://marketplace.visualstudio.com/items?itemName=pavleenbali.frootai)
|
|
199
|
+
- **GitHub:** [github.com/gitpavleenbali/frootai](https://github.com/gitpavleenbali/frootai)
|
|
200
|
+
- **Python MCP Server:** [frootai-mcp (PyPI)](https://pypi.org/project/frootai-mcp/)
|
|
201
|
+
|
|
202
|
+
## License
|
|
203
|
+
|
|
204
|
+
MIT — Pavleen Bali
|
frootai-3.3.0/README.md
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# FrootAI — Python SDK
|
|
2
|
+
|
|
3
|
+
> Offline-first access to 16 AI architecture knowledge modules, 20 solution plays, cost estimation, evaluation, and A/B testing. Zero external dependencies.
|
|
4
|
+
|
|
5
|
+
[PyPI package](https://pypi.org/project/frootai/)
|
|
6
|
+
[Supported Python versions](https://pypi.org/project/frootai/)
|
|
7
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install frootai
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
from frootai import FrootAI, SolutionPlay, Evaluator
|
|
19
|
+
|
|
20
|
+
# Search 643KB knowledge base (16 modules across 5 FROOT layers)
|
|
21
|
+
client = FrootAI()
|
|
22
|
+
results = client.search("RAG architecture")
|
|
23
|
+
for r in results:
|
|
24
|
+
print(f"[{r['module_id']}] {r['title']} — {r['relevance']} hits")
|
|
25
|
+
|
|
26
|
+
# Get a specific module
|
|
27
|
+
module = client.get_module("R2") # RAG Architecture & Retrieval
|
|
28
|
+
print(f"{module['title']}: {module['content_length']:,} chars")
|
|
29
|
+
|
|
30
|
+
# List all FROOT layers
|
|
31
|
+
for layer in client.list_layers():
|
|
32
|
+
print(f"{layer['emoji']} {layer['name']} ({len(layer['modules'])} modules)")
|
|
33
|
+
|
|
34
|
+
# Estimate Azure costs
|
|
35
|
+
cost = client.estimate_cost("01-enterprise-rag", scale="prod")
|
|
36
|
+
print(f"${cost['monthly_total']}/mo")
|
|
37
|
+
|
|
38
|
+
# Browse 20 solution plays
|
|
39
|
+
plays = SolutionPlay.all()
|
|
40
|
+
ready = SolutionPlay.ready() # 3 production-ready
|
|
41
|
+
by_layer = SolutionPlay.by_layer("R") # Reasoning layer plays
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Features
|
|
45
|
+
|
|
46
|
+
### Knowledge Search (offline, no API calls)
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
client = FrootAI()
|
|
50
|
+
|
|
51
|
+
# Full-text search across 16 modules (643KB of real content)
|
|
52
|
+
results = client.search("embeddings", max_results=5)
|
|
53
|
+
|
|
54
|
+
# Get module by ID
|
|
55
|
+
mod = client.get_module("O2") # AI Agents & Microsoft Agent Framework
|
|
56
|
+
|
|
57
|
+
# List all modules
|
|
58
|
+
for m in client.list_modules():
|
|
59
|
+
print(f"{m['emoji']} {m['id']} {m['title']} ({m['content_length'] // 1024}KB)")
|
|
60
|
+
|
|
61
|
+
# Extract a specific section
|
|
62
|
+
section = client.get_module_section("F1", "Table of Contents")
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Glossary (159+ terms extracted from content)
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
# Look up a term
|
|
69
|
+
term = client.lookup_term("temperature")
|
|
70
|
+
|
|
71
|
+
# Search glossary
|
|
72
|
+
terms = client.search_glossary("embedding", max_results=10)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Cost Estimation
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
# Estimate monthly Azure costs for a solution play
|
|
79
|
+
cost = client.estimate_cost("01-enterprise-rag", scale="dev")
|
|
80
|
+
# {'play': '01-enterprise-rag', 'scale': 'dev', 'monthly_total': 430, 'breakdown': {...}}
|
|
81
|
+
|
|
82
|
+
cost = client.estimate_cost("01-enterprise-rag", scale="prod")
|
|
83
|
+
# {'monthly_total': 3600, 'breakdown': {'openai-gpt4o': 2500, 'ai-search-standard': 750, ...}}
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Solution Plays (20 pre-tuned architecture blueprints)
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from frootai.plays import SolutionPlay
|
|
90
|
+
|
|
91
|
+
play = SolutionPlay.get("03")
|
|
92
|
+
print(f"{play.name}: {play.description}")
|
|
93
|
+
print(f"Infrastructure: {play.infra}")
|
|
94
|
+
print(f"Tuning params: {play.tuning}")
|
|
95
|
+
print(f"Related modules: {play.modules}")
|
|
96
|
+
|
|
97
|
+
# Filter by FROOT layer
|
|
98
|
+
orchestration_plays = SolutionPlay.by_layer("O_ORCH")
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Evaluation (quality gates)
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from frootai import Evaluator
|
|
105
|
+
|
|
106
|
+
evaluator = Evaluator()
|
|
107
|
+
scores = {"groundedness": 4.5, "relevance": 3.2, "coherence": 4.1, "fluency": 4.8}
|
|
108
|
+
|
|
109
|
+
results = evaluator.check_thresholds(scores)
|
|
110
|
+
print(evaluator.summary(scores))
|
|
111
|
+
# 3/4 checks passed (relevance 3.2 < threshold 4.0)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### A/B Testing (prompt experiments)
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from frootai.ab_testing import PromptExperiment, PromptVariant
|
|
118
|
+
|
|
119
|
+
# You provide the model function — no fake scores
|
|
120
|
+
def my_model(system_prompt, query):
|
|
121
|
+
return call_your_llm(system_prompt=system_prompt, user_message=query)
|
|
122
|
+
|
|
123
|
+
def my_scorer(query, response):
|
|
124
|
+
return {"groundedness": 4.5, "relevance": 4.0}
|
|
125
|
+
|
|
126
|
+
experiment = PromptExperiment(
|
|
127
|
+
name="system-prompt-v2",
|
|
128
|
+
variants=[
|
|
129
|
+
PromptVariant("control", "You are a helpful assistant."),
|
|
130
|
+
PromptVariant("expert", "You are an Azure AI expert. Cite sources."),
|
|
131
|
+
],
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
results = experiment.run(["What is RAG?"], model_fn=my_model, scorer_fn=my_scorer)
|
|
135
|
+
print(f"Winner: {experiment.pick_winner(results)}")
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## CLI
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
frootai plays # List all 20 solution plays
|
|
142
|
+
frootai plays --ready # Show production-ready plays only
|
|
143
|
+
frootai plays --layer R # Filter by FROOT layer
|
|
144
|
+
frootai search "embeddings" # Search knowledge base
|
|
145
|
+
frootai modules # List all 16 modules with sizes
|
|
146
|
+
frootai glossary temperature # Look up a term
|
|
147
|
+
frootai cost 01-enterprise-rag # Estimate Azure costs
|
|
148
|
+
frootai cost 01-enterprise-rag --scale prod
|
|
149
|
+
frootai --version # Show version
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## What's Inside
|
|
153
|
+
|
|
154
|
+
- **16 knowledge modules** (643KB) across 5 FROOT layers: Foundations, Reasoning, Orchestration, Operations, Transformation
|
|
155
|
+
- **20 solution plays** with infrastructure, tuning parameters, and module mapping
|
|
156
|
+
- **159+ glossary terms** extracted from module content
|
|
157
|
+
- **Cost estimation** for 10 plays with dev/prod breakdowns
|
|
158
|
+
- **Evaluation framework** with configurable thresholds
|
|
159
|
+
- **A/B testing framework** with real model callbacks (no fake scores)
|
|
160
|
+
- **Zero external dependencies** — pure Python stdlib
|
|
161
|
+
|
|
162
|
+
## FROOT Layers
|
|
163
|
+
|
|
164
|
+
| Layer | Emoji | Name | Modules |
|
|
165
|
+
|-------|-------|------|---------|
|
|
166
|
+
| F | 🌱 | Foundations | F1-F4 (GenAI, LLMs, Glossary, Agentic OS) |
|
|
167
|
+
| R | 🪵 | Reasoning | R1-R3 (Prompts, RAG, Deterministic AI) |
|
|
168
|
+
| O_ORCH | 🌿 | Orchestration | O1-O3 (Semantic Kernel, Agents, MCP) |
|
|
169
|
+
| O_OPS | 🏗️ | Operations | O4-O6 (Platform, Infrastructure, Copilot) |
|
|
170
|
+
| T | 🍎 | Transformation | T1-T3 (Fine-Tuning, Responsible AI, Production) |
|
|
171
|
+
|
|
172
|
+
## Links
|
|
173
|
+
|
|
174
|
+
- **Website:** [frootai.dev](https://frootai.dev)
|
|
175
|
+
- **npm MCP Server:** [frootai-mcp](https://www.npmjs.com/package/frootai-mcp)
|
|
176
|
+
- **VS Code Extension:** [pavleenbali.frootai](https://marketplace.visualstudio.com/items?itemName=pavleenbali.frootai)
|
|
177
|
+
- **GitHub:** [github.com/gitpavleenbali/frootai](https://github.com/gitpavleenbali/frootai)
|
|
178
|
+
- **Python MCP Server:** [frootai-mcp (PyPI)](https://pypi.org/project/frootai-mcp/)
|
|
179
|
+
|
|
180
|
+
## License
|
|
181
|
+
|
|
182
|
+
MIT — Pavleen Bali
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""FrootAI SDK — Programmatic access to the FrootAI ecosystem.
|
|
2
|
+
|
|
3
|
+
From the Roots to the Fruits. It's simply Frootful.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
from frootai import FrootAI, SolutionPlay, Evaluator
|
|
7
|
+
|
|
8
|
+
client = FrootAI()
|
|
9
|
+
results = client.search("RAG architecture")
|
|
10
|
+
module = client.get_module("R2")
|
|
11
|
+
cost = client.estimate_cost("01-enterprise-rag", scale="dev")
|
|
12
|
+
|
|
13
|
+
plays = SolutionPlay.all()
|
|
14
|
+
play = SolutionPlay.get("03")
|
|
15
|
+
|
|
16
|
+
evaluator = Evaluator()
|
|
17
|
+
evaluator.check_thresholds({"groundedness": 4.2, "relevance": 3.8})
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
__version__ = "3.3.0"
|
|
21
|
+
__author__ = "Pavleen Bali"
|
|
22
|
+
|
|
23
|
+
from frootai.client import FrootAI
|
|
24
|
+
from frootai.plays import SolutionPlay
|
|
25
|
+
from frootai.evaluation import Evaluator
|
|
26
|
+
|
|
27
|
+
__all__ = ["FrootAI", "SolutionPlay", "Evaluator", "__version__"]
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""FrootAI Prompt A/B Testing Framework.
|
|
2
|
+
|
|
3
|
+
Run prompt experiments across variants, measure quality, pick winners.
|
|
4
|
+
Requires a model_fn callback for actual LLM inference.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from frootai.ab_testing import PromptExperiment, PromptVariant
|
|
8
|
+
|
|
9
|
+
def my_model(system_prompt: str, query: str) -> str:
|
|
10
|
+
# Call Azure OpenAI, local model, etc.
|
|
11
|
+
return openai_client.chat(system_prompt=system_prompt, query=query)
|
|
12
|
+
|
|
13
|
+
def my_scorer(query: str, response: str) -> dict[str, float]:
|
|
14
|
+
return {"groundedness": 4.5, "relevance": 4.0}
|
|
15
|
+
|
|
16
|
+
experiment = PromptExperiment(
|
|
17
|
+
name="rag-system-prompt-v2",
|
|
18
|
+
variants=[
|
|
19
|
+
PromptVariant("control", "You are a helpful assistant."),
|
|
20
|
+
PromptVariant("concise", "You are a concise assistant. Answer in 2 sentences max."),
|
|
21
|
+
PromptVariant("expert", "You are an Azure AI expert. Cite sources."),
|
|
22
|
+
],
|
|
23
|
+
metrics=["groundedness", "relevance", "latency"],
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
results = experiment.run(
|
|
27
|
+
test_queries=["What is RAG?", "Explain embeddings"],
|
|
28
|
+
model_fn=my_model,
|
|
29
|
+
scorer_fn=my_scorer,
|
|
30
|
+
)
|
|
31
|
+
winner = experiment.pick_winner(results)
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from dataclasses import dataclass, field
|
|
35
|
+
from typing import Optional, Callable
|
|
36
|
+
import json
|
|
37
|
+
import time
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class PromptVariant:
    """A single prompt variant in an A/B test.

    Attributes:
        name: Label identifying the variant in results and summaries.
        system_prompt: System prompt passed as the first argument to the
            model function.
        weight: Relative weight of the variant. It is exported by
            ``PromptExperiment.to_json`` but is not consulted when running
            an experiment or picking a winner.
    """
    name: str
    system_prompt: str
    weight: float = 1.0


@dataclass
class ExperimentResult:
    """Result of running one variant against one query.

    Attributes:
        variant: Name of the ``PromptVariant`` that produced the response.
        query: The test query that was sent.
        response: Whatever the model function returned for this query.
        latency_ms: Wall-clock duration of the model call in milliseconds.
        scores: Metric name -> score. ``PromptExperiment.run`` also stores
            the latency here under the ``"latency_ms"`` key.
    """
    variant: str
    query: str
    response: str
    latency_ms: float
    scores: dict[str, float] = field(default_factory=dict)


@dataclass
class PromptExperiment:
    """A/B testing experiment for prompt variants.

    Attributes:
        name: Experiment identifier.
        variants: Prompt variants to test.
        metrics: Names of the quality metrics of interest. This list is
            exported by ``to_json``; ``run`` and ``pick_winner`` do not
            filter scores by it.
    """
    name: str
    variants: list[PromptVariant]
    metrics: list[str] = field(default_factory=lambda: ["groundedness", "relevance", "coherence"])

    def run(
        self,
        test_queries: list[str],
        model_fn: Callable[[str, str], str],
        scorer_fn: Optional[Callable[[str, str], dict[str, float]]] = None,
        rounds: int = 1,
    ) -> list[ExperimentResult]:
        """Run every variant against every query and collect the results.

        Args:
            test_queries: Questions to test each variant against.
            model_fn: Callable(system_prompt, query) -> response string.
            scorer_fn: Optional Callable(query, response) -> {metric: score}.
                If not provided, only latency is measured.
            rounds: Number of times to repeat the full query/variant sweep
                (for statistical stability). Values below 1 yield no results.

        Returns:
            One ``ExperimentResult`` per (round, query, variant), in that
            nesting order.
        """
        results: list[ExperimentResult] = []
        for _ in range(rounds):
            for query in test_queries:
                for variant in self.variants:
                    # Time only the model call, not the scoring.
                    start = time.perf_counter()
                    response = model_fn(variant.system_prompt, query)
                    latency_ms = round((time.perf_counter() - start) * 1000, 1)

                    # Copy the scorer's output: it may return a shared or
                    # cached dict, and adding "latency_ms" to it in place
                    # would both mutate the caller's object and make every
                    # result alias the same scores.
                    scores: dict[str, float] = {}
                    if scorer_fn is not None:
                        scores = dict(scorer_fn(query, response))
                    scores["latency_ms"] = latency_ms

                    results.append(
                        ExperimentResult(
                            variant=variant.name,
                            query=query,
                            response=response,
                            latency_ms=latency_ms,
                            scores=scores,
                        )
                    )
        return results

    def pick_winner(self, results: list[ExperimentResult]) -> str:
        """Pick the best variant based on average scores (excluding latency).

        Each result contributes the mean of its non-latency scores; a
        variant's rating is the mean of those per-result means. If no result
        carries any quality score, the variant with the lowest average
        latency wins instead.

        Args:
            results: Results as returned by ``run``.

        Returns:
            The name of the winning variant.

        Raises:
            ValueError: If ``results`` is empty.
        """
        if not results:
            # Without this guard the latency fallback would call min() on an
            # empty dict and fail with an unhelpful message.
            raise ValueError("pick_winner() requires at least one ExperimentResult")

        quality_by_variant: dict[str, list[float]] = {}
        for r in results:
            per_variant = quality_by_variant.setdefault(r.variant, [])
            quality = [v for k, v in r.scores.items() if k != "latency_ms"]
            if quality:
                per_variant.append(sum(quality) / len(quality))

        # Variants that never received a quality score are not ranked.
        averages = {v: sum(s) / len(s) for v, s in quality_by_variant.items() if s}
        if averages:
            return max(averages, key=averages.get)

        # Fall back to lowest latency if no quality scores
        latencies: dict[str, list[float]] = {}
        for r in results:
            latencies.setdefault(r.variant, []).append(r.latency_ms)
        return min(latencies, key=lambda v: sum(latencies[v]) / len(latencies[v]))

    def summary(self, results: list[ExperimentResult]) -> str:
        """Generate a human-readable experiment summary.

        Lists, per variant, the sample count and the average of every score
        key found on its results (including ``latency_ms``), followed by the
        winner chosen by ``pick_winner``.

        Raises:
            ValueError: If ``results`` is empty (propagated from
                ``pick_winner``).
        """
        lines = [f"Experiment: {self.name}", "=" * 50]
        variant_data: dict[str, list[ExperimentResult]] = {}
        for r in results:
            variant_data.setdefault(r.variant, []).append(r)

        for variant, data in variant_data.items():
            all_metrics: set[str] = set()
            for r in data:
                all_metrics.update(r.scores)

            avg_scores: dict[str, float] = {}
            for m in sorted(all_metrics):
                vals = [r.scores[m] for r in data if m in r.scores]
                if vals:
                    avg_scores[m] = sum(vals) / len(vals)

            lines.append(f"\n Variant: {variant}")
            lines.append(f" Samples: {len(data)}")
            for m, s in avg_scores.items():
                lines.append(f" {m}: {s:.2f}")

        winner = self.pick_winner(results)
        lines.append(f"\n Winner: {winner}")
        return "\n".join(lines)

    def to_json(self) -> str:
        """Export the experiment configuration (not results) as JSON."""
        return json.dumps({
            "name": self.name,
            "variants": [{"name": v.name, "system_prompt": v.system_prompt, "weight": v.weight} for v in self.variants],
            "metrics": self.metrics,
        }, indent=2)
|