prodloop-observability-sdk 0.1.7__tar.gz → 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {prodloop_observability_sdk-0.1.7/prodloop_observability_sdk.egg-info → prodloop_observability_sdk-0.1.9}/PKG-INFO +108 -4
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/README.md +107 -3
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/docs/api-reference.md +24 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/docs/examples.md +84 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/docs/getting-started.md +15 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/docs/index.md +1 -0
- prodloop_observability_sdk-0.1.9/docs/parameters.md +108 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/examples/.env.example +12 -0
- prodloop_observability_sdk-0.1.9/examples/audit_discovery_demo.py +96 -0
- prodloop_observability_sdk-0.1.9/examples/post_call_prompt_aware_demo.py +97 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/prodloop/__init__.py +2 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/prodloop/client.py +74 -16
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/prodloop/models.py +49 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9/prodloop_observability_sdk.egg-info}/PKG-INFO +108 -4
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/prodloop_observability_sdk.egg-info/SOURCES.txt +2 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/pyproject.toml +1 -1
- prodloop_observability_sdk-0.1.7/docs/parameters.md +0 -58
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/CHANGELOG.md +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/MANIFEST.in +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/docs/authentication.md +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/docs/errors.md +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/docs/visual-overview.md +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/examples/demo.py +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/examples/demo_gpt.py +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/examples/user_orchestrated_demo.py +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/examples/user_orchestrated_demo_gpt.py +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/mkdocs.yml +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/prodloop/exceptions.py +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/prodloop/plugins/__init__.py +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/prodloop/plugins/_utils.py +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/prodloop/plugins/litellm.py +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/prodloop_observability_sdk.egg-info/dependency_links.txt +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/prodloop_observability_sdk.egg-info/requires.txt +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/prodloop_observability_sdk.egg-info/top_level.txt +0 -0
- {prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prodloop-observability-sdk
|
|
3
|
-
Version: 0.1.7
|
|
3
|
+
Version: 0.1.9
|
|
4
4
|
Summary: Python SDK for evaluating AI voice bot calls via Prodloop APIs.
|
|
5
5
|
Project-URL: Homepage, https://prodloop.com
|
|
6
6
|
Project-URL: Documentation, https://observability-sdk-docs.pages.dev/
|
|
@@ -27,7 +27,7 @@ pip install prodloop-observability-sdk
|
|
|
27
27
|
## Quickstart
|
|
28
28
|
|
|
29
29
|
```python
|
|
30
|
-
from prodloop import
|
|
30
|
+
from prodloop import CustomEvaluationParameter, EvaluationParameter, ProdloopClient
|
|
31
31
|
|
|
32
32
|
client = ProdloopClient(api_key="sk_live_...")
|
|
33
33
|
|
|
@@ -38,12 +38,40 @@ result = client.evaluate_call(
|
|
|
38
38
|
EvaluationParameter.HALLUCINATION,
|
|
39
39
|
],
|
|
40
40
|
thresholds={"e2e_response_time_max_ms": 800},
|
|
41
|
+
custom_parameters=[
|
|
42
|
+
CustomEvaluationParameter(
|
|
43
|
+
key="resolution_quality",
|
|
44
|
+
label="Resolution quality",
|
|
45
|
+
description="Check whether the bot correctly understood the issue and reached a useful final outcome.",
|
|
46
|
+
),
|
|
47
|
+
],
|
|
41
48
|
input_prompt="Bot instructions used during this call...",
|
|
42
49
|
)
|
|
43
50
|
|
|
44
51
|
print(result)
|
|
45
52
|
```
|
|
46
53
|
|
|
54
|
+
## Custom Parameters
|
|
55
|
+
|
|
56
|
+
Use `custom_parameters` for audit dimensions that are not part of the fixed `EvaluationParameter` enum. Each custom parameter needs a stable `key` and a clear `description`; `label` is optional.
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
result = client.evaluate_call(
|
|
60
|
+
audio_file_path="call.mp3",
|
|
61
|
+
parameters=[EvaluationParameter.HALLUCINATION],
|
|
62
|
+
custom_parameters=[
|
|
63
|
+
{
|
|
64
|
+
"key": "driver_resolution_quality",
|
|
65
|
+
"label": "Driver resolution quality",
|
|
66
|
+
"description": "Evaluate whether the bot handled driver-not-found or cancellation cases correctly and empathetically.",
|
|
67
|
+
}
|
|
68
|
+
],
|
|
69
|
+
input_prompt="Use the Namma Yatri cancellation support policy as context.",
|
|
70
|
+
)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Custom checks are sent as `custom_parameters` metadata and evaluated from their descriptions plus optional `input_prompt` context.
|
|
74
|
+
|
|
47
75
|
## Extraction Validation
|
|
48
76
|
|
|
49
77
|
To validate extraction quality, pass both `extraction_schema` and `bot_captured_variables`:
|
|
@@ -64,16 +92,52 @@ Response includes:
|
|
|
64
92
|
|
|
65
93
|
## Hallucination Input Requirement
|
|
66
94
|
|
|
67
|
-
When requesting `hallucination
|
|
95
|
+
When requesting `hallucination` or any prompt-aware parameter, pass the bot's original call prompt as `input_prompt`:
|
|
68
96
|
|
|
69
97
|
```python
|
|
70
98
|
result = client.evaluate_call(
|
|
71
99
|
audio_file_path="call.mp3",
|
|
72
|
-
parameters=[
|
|
100
|
+
parameters=[
|
|
101
|
+
EvaluationParameter.HALLUCINATION,
|
|
102
|
+
EvaluationParameter.SECTION_SEQUENCING,
|
|
103
|
+
EvaluationParameter.INTERNAL_JARGON_LEAKAGE,
|
|
104
|
+
],
|
|
73
105
|
input_prompt="You are a polite admissions bot. Never invent course details.",
|
|
74
106
|
)
|
|
75
107
|
```
|
|
76
108
|
|
|
109
|
+
Prompt-aware parameter results use a compact shape:
|
|
110
|
+
|
|
111
|
+
```json
|
|
112
|
+
{
|
|
113
|
+
"passed": "true",
|
|
114
|
+
"explanation": "..."
|
|
115
|
+
}
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
`passed` can be `"true"`, `"false"`, or `"N/A"`. `"N/A"` means the parameter was not relevant to the supplied prompt or the call did not exercise enough behavior to judge it.
|
|
119
|
+
|
|
120
|
+
Example prompt-aware response:
|
|
121
|
+
|
|
122
|
+
```json
|
|
123
|
+
{
|
|
124
|
+
"section_sequencing": {
|
|
125
|
+
"passed": "false",
|
|
126
|
+
"explanation": "The bot did not follow the section flow defined in the supplied prompt."
|
|
127
|
+
},
|
|
128
|
+
"mandatory_field_gating": {
|
|
129
|
+
"passed": "N/A",
|
|
130
|
+
"explanation": "The prompt-defined gated action was not triggered in this call."
|
|
131
|
+
},
|
|
132
|
+
"prompt_injection": {
|
|
133
|
+
"passed": "N/A",
|
|
134
|
+
"explanation": "The caller did not attempt to override instructions or inject commands."
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Runnable example: `examples/post_call_prompt_aware_demo.py`. The same flow was production-tested with a partial parameter set and then with all prompt-aware parameters.
|
|
140
|
+
|
|
77
141
|
## Supported Parameters
|
|
78
142
|
|
|
79
143
|
- `e2e_response_time`
|
|
@@ -83,6 +147,23 @@ result = client.evaluate_call(
|
|
|
83
147
|
- `hallucination`
|
|
84
148
|
- `extraction_variables`
|
|
85
149
|
- `interruption_behavior`
|
|
150
|
+
- `section_sequencing`
|
|
151
|
+
- `mandatory_field_gating`
|
|
152
|
+
- `interrupt_resume_precision`
|
|
153
|
+
- `closing_verbatim_delivery`
|
|
154
|
+
- `single_attempt_constraints`
|
|
155
|
+
- `info_dump_handling`
|
|
156
|
+
- `mid_flow_intent_switch`
|
|
157
|
+
- `side_talk_leakage`
|
|
158
|
+
- `ambiguous_partial_responses`
|
|
159
|
+
- `internal_jargon_leakage`
|
|
160
|
+
- `identity_extraction`
|
|
161
|
+
- `prompt_injection`
|
|
162
|
+
- `commitment_extraction`
|
|
163
|
+
- `scope_boundary_testing`
|
|
164
|
+
- `roleplay_jailbreak`
|
|
165
|
+
- `context_memory_across_turns`
|
|
166
|
+
- `hallucination_fabrication`
|
|
86
167
|
|
|
87
168
|
## Parameter Purpose
|
|
88
169
|
|
|
@@ -93,6 +174,23 @@ result = client.evaluate_call(
|
|
|
93
174
|
- `hallucination`: whether the bot produced fabricated or incorrect claims.
|
|
94
175
|
- `extraction_variables`: structured variable extraction from call audio.
|
|
95
176
|
- `interruption_behavior`: whether the bot handled interruptions gracefully.
|
|
177
|
+
- `section_sequencing`: whether the bot followed the prompt-defined flow order.
|
|
178
|
+
- `mandatory_field_gating`: whether prerequisite information was collected before dependent actions.
|
|
179
|
+
- `interrupt_resume_precision`: whether the bot resumed the exact pending step after interruptions.
|
|
180
|
+
- `closing_verbatim_delivery`: whether required closings and terminal-state behavior matched the prompt.
|
|
181
|
+
- `single_attempt_constraints`: whether one-attempt or bounded-retry rules were respected.
|
|
182
|
+
- `info_dump_handling`: whether dense user-provided details were captured and reused.
|
|
183
|
+
- `mid_flow_intent_switch`: whether intent changes were handled without losing context.
|
|
184
|
+
- `side_talk_leakage`: whether background or third-party speech was ignored correctly.
|
|
185
|
+
- `ambiguous_partial_responses`: whether vague answers were clarified before routing or confirming.
|
|
186
|
+
- `internal_jargon_leakage`: whether internal prompt, system, tooling, variable, or process language leaked to the user.
|
|
187
|
+
- `identity_extraction`: whether identity or contact details were captured and used according to the prompt.
|
|
188
|
+
- `prompt_injection`: whether user instructions improperly overrode the prompt.
|
|
189
|
+
- `commitment_extraction`: whether unsupported guarantees, confirmations, timelines, or binding claims were avoided.
|
|
190
|
+
- `scope_boundary_testing`: whether the bot stayed within the prompt-defined scope.
|
|
191
|
+
- `roleplay_jailbreak`: whether persona/role changes that conflict with the prompt were resisted.
|
|
192
|
+
- `context_memory_across_turns`: whether prior context and corrections were retained.
|
|
193
|
+
- `hallucination_fabrication`: whether unsupported facts, claims, statuses, policies, capabilities, or operational statements were fabricated.
|
|
96
194
|
|
|
97
195
|
Deterministic parameters are computed directly from the audio signal:
|
|
98
196
|
`e2e_response_time`, `turn_by_turn_latency`, `pause_profile`, `audio_artifacts`.
|
|
@@ -120,6 +218,8 @@ There are two modes:
|
|
|
120
218
|
- `self_simulation`: Prodloop backend runs the tester and bot conversation. You select the bot model route, but Prodloop-owned backend credentials are used. No bot credentials are sent from your code.
|
|
121
219
|
- `user_orchestrated`: Prodloop backend runs the tester and grader. Your SDK process runs the bot locally with your own credentials and sends only bot replies/latency back to Prodloop.
|
|
122
220
|
|
|
221
|
+
The production backend also supports `audit_discovery` for deeper prompt-risk discovery. Runnable examples are available in `examples/audit_discovery_demo.py` and `simulation_demo/prod_testing/after_pypi/audit_discovery_demo.py`.
|
|
222
|
+
|
|
123
223
|
Simulation currently accepts exactly one parameter per request. To test multiple parameters, start one simulation per parameter. `max_turns` is configurable from `1` to `10`.
|
|
124
224
|
|
|
125
225
|
Discover currently enabled simulation parameters at runtime:
|
|
@@ -210,6 +310,10 @@ For `user_orchestrated`, configure bot credentials locally for either Vertex AI
|
|
|
210
310
|
|
|
211
311
|
For adaptive simulations, `max_turns` controls turns per conversation and `adaptive_max_conversations` controls the maximum number of conversations to explore.
|
|
212
312
|
|
|
313
|
+
### Audit Discovery
|
|
314
|
+
|
|
315
|
+
Audit discovery plans targeted risk scenarios for one selected parameter, runs them against the bot, and returns passed/failed scenarios plus patch guidance for failures. A production smoke test for `section_sequencing` completed successfully with `status="completed"`, `final_result.overall_pass=true`, and `final_result.stop_reason="audit_discovery_completed"`.
|
|
316
|
+
|
|
213
317
|
### Result Shape
|
|
214
318
|
|
|
215
319
|
Simulation responses include:
|
|
@@ -11,7 +11,7 @@ pip install prodloop-observability-sdk
|
|
|
11
11
|
## Quickstart
|
|
12
12
|
|
|
13
13
|
```python
|
|
14
|
-
from prodloop import
|
|
14
|
+
from prodloop import CustomEvaluationParameter, EvaluationParameter, ProdloopClient
|
|
15
15
|
|
|
16
16
|
client = ProdloopClient(api_key="sk_live_...")
|
|
17
17
|
|
|
@@ -22,12 +22,40 @@ result = client.evaluate_call(
|
|
|
22
22
|
EvaluationParameter.HALLUCINATION,
|
|
23
23
|
],
|
|
24
24
|
thresholds={"e2e_response_time_max_ms": 800},
|
|
25
|
+
custom_parameters=[
|
|
26
|
+
CustomEvaluationParameter(
|
|
27
|
+
key="resolution_quality",
|
|
28
|
+
label="Resolution quality",
|
|
29
|
+
description="Check whether the bot correctly understood the issue and reached a useful final outcome.",
|
|
30
|
+
),
|
|
31
|
+
],
|
|
25
32
|
input_prompt="Bot instructions used during this call...",
|
|
26
33
|
)
|
|
27
34
|
|
|
28
35
|
print(result)
|
|
29
36
|
```
|
|
30
37
|
|
|
38
|
+
## Custom Parameters
|
|
39
|
+
|
|
40
|
+
Use `custom_parameters` for audit dimensions that are not part of the fixed `EvaluationParameter` enum. Each custom parameter needs a stable `key` and a clear `description`; `label` is optional.
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
result = client.evaluate_call(
|
|
44
|
+
audio_file_path="call.mp3",
|
|
45
|
+
parameters=[EvaluationParameter.HALLUCINATION],
|
|
46
|
+
custom_parameters=[
|
|
47
|
+
{
|
|
48
|
+
"key": "driver_resolution_quality",
|
|
49
|
+
"label": "Driver resolution quality",
|
|
50
|
+
"description": "Evaluate whether the bot handled driver-not-found or cancellation cases correctly and empathetically.",
|
|
51
|
+
}
|
|
52
|
+
],
|
|
53
|
+
input_prompt="Use the Namma Yatri cancellation support policy as context.",
|
|
54
|
+
)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Custom checks are sent as `custom_parameters` metadata and evaluated from their descriptions plus optional `input_prompt` context.
|
|
58
|
+
|
|
31
59
|
## Extraction Validation
|
|
32
60
|
|
|
33
61
|
To validate extraction quality, pass both `extraction_schema` and `bot_captured_variables`:
|
|
@@ -48,16 +76,52 @@ Response includes:
|
|
|
48
76
|
|
|
49
77
|
## Hallucination Input Requirement
|
|
50
78
|
|
|
51
|
-
When requesting `hallucination
|
|
79
|
+
When requesting `hallucination` or any prompt-aware parameter, pass the bot's original call prompt as `input_prompt`:
|
|
52
80
|
|
|
53
81
|
```python
|
|
54
82
|
result = client.evaluate_call(
|
|
55
83
|
audio_file_path="call.mp3",
|
|
56
|
-
parameters=[
|
|
84
|
+
parameters=[
|
|
85
|
+
EvaluationParameter.HALLUCINATION,
|
|
86
|
+
EvaluationParameter.SECTION_SEQUENCING,
|
|
87
|
+
EvaluationParameter.INTERNAL_JARGON_LEAKAGE,
|
|
88
|
+
],
|
|
57
89
|
input_prompt="You are a polite admissions bot. Never invent course details.",
|
|
58
90
|
)
|
|
59
91
|
```
|
|
60
92
|
|
|
93
|
+
Prompt-aware parameter results use a compact shape:
|
|
94
|
+
|
|
95
|
+
```json
|
|
96
|
+
{
|
|
97
|
+
"passed": "true",
|
|
98
|
+
"explanation": "..."
|
|
99
|
+
}
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
`passed` can be `"true"`, `"false"`, or `"N/A"`. `"N/A"` means the parameter was not relevant to the supplied prompt or the call did not exercise enough behavior to judge it.
|
|
103
|
+
|
|
104
|
+
Example prompt-aware response:
|
|
105
|
+
|
|
106
|
+
```json
|
|
107
|
+
{
|
|
108
|
+
"section_sequencing": {
|
|
109
|
+
"passed": "false",
|
|
110
|
+
"explanation": "The bot did not follow the section flow defined in the supplied prompt."
|
|
111
|
+
},
|
|
112
|
+
"mandatory_field_gating": {
|
|
113
|
+
"passed": "N/A",
|
|
114
|
+
"explanation": "The prompt-defined gated action was not triggered in this call."
|
|
115
|
+
},
|
|
116
|
+
"prompt_injection": {
|
|
117
|
+
"passed": "N/A",
|
|
118
|
+
"explanation": "The caller did not attempt to override instructions or inject commands."
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Runnable example: `examples/post_call_prompt_aware_demo.py`. The same flow was production-tested with a partial parameter set and then with all prompt-aware parameters.
|
|
124
|
+
|
|
61
125
|
## Supported Parameters
|
|
62
126
|
|
|
63
127
|
- `e2e_response_time`
|
|
@@ -67,6 +131,23 @@ result = client.evaluate_call(
|
|
|
67
131
|
- `hallucination`
|
|
68
132
|
- `extraction_variables`
|
|
69
133
|
- `interruption_behavior`
|
|
134
|
+
- `section_sequencing`
|
|
135
|
+
- `mandatory_field_gating`
|
|
136
|
+
- `interrupt_resume_precision`
|
|
137
|
+
- `closing_verbatim_delivery`
|
|
138
|
+
- `single_attempt_constraints`
|
|
139
|
+
- `info_dump_handling`
|
|
140
|
+
- `mid_flow_intent_switch`
|
|
141
|
+
- `side_talk_leakage`
|
|
142
|
+
- `ambiguous_partial_responses`
|
|
143
|
+
- `internal_jargon_leakage`
|
|
144
|
+
- `identity_extraction`
|
|
145
|
+
- `prompt_injection`
|
|
146
|
+
- `commitment_extraction`
|
|
147
|
+
- `scope_boundary_testing`
|
|
148
|
+
- `roleplay_jailbreak`
|
|
149
|
+
- `context_memory_across_turns`
|
|
150
|
+
- `hallucination_fabrication`
|
|
70
151
|
|
|
71
152
|
## Parameter Purpose
|
|
72
153
|
|
|
@@ -77,6 +158,23 @@ result = client.evaluate_call(
|
|
|
77
158
|
- `hallucination`: whether the bot produced fabricated or incorrect claims.
|
|
78
159
|
- `extraction_variables`: structured variable extraction from call audio.
|
|
79
160
|
- `interruption_behavior`: whether the bot handled interruptions gracefully.
|
|
161
|
+
- `section_sequencing`: whether the bot followed the prompt-defined flow order.
|
|
162
|
+
- `mandatory_field_gating`: whether prerequisite information was collected before dependent actions.
|
|
163
|
+
- `interrupt_resume_precision`: whether the bot resumed the exact pending step after interruptions.
|
|
164
|
+
- `closing_verbatim_delivery`: whether required closings and terminal-state behavior matched the prompt.
|
|
165
|
+
- `single_attempt_constraints`: whether one-attempt or bounded-retry rules were respected.
|
|
166
|
+
- `info_dump_handling`: whether dense user-provided details were captured and reused.
|
|
167
|
+
- `mid_flow_intent_switch`: whether intent changes were handled without losing context.
|
|
168
|
+
- `side_talk_leakage`: whether background or third-party speech was ignored correctly.
|
|
169
|
+
- `ambiguous_partial_responses`: whether vague answers were clarified before routing or confirming.
|
|
170
|
+
- `internal_jargon_leakage`: whether internal prompt, system, tooling, variable, or process language leaked to the user.
|
|
171
|
+
- `identity_extraction`: whether identity or contact details were captured and used according to the prompt.
|
|
172
|
+
- `prompt_injection`: whether user instructions improperly overrode the prompt.
|
|
173
|
+
- `commitment_extraction`: whether unsupported guarantees, confirmations, timelines, or binding claims were avoided.
|
|
174
|
+
- `scope_boundary_testing`: whether the bot stayed within the prompt-defined scope.
|
|
175
|
+
- `roleplay_jailbreak`: whether persona/role changes that conflict with the prompt were resisted.
|
|
176
|
+
- `context_memory_across_turns`: whether prior context and corrections were retained.
|
|
177
|
+
- `hallucination_fabrication`: whether unsupported facts, claims, statuses, policies, capabilities, or operational statements were fabricated.
|
|
80
178
|
|
|
81
179
|
Deterministic parameters are computed directly from the audio signal:
|
|
82
180
|
`e2e_response_time`, `turn_by_turn_latency`, `pause_profile`, `audio_artifacts`.
|
|
@@ -104,6 +202,8 @@ There are two modes:
|
|
|
104
202
|
- `self_simulation`: Prodloop backend runs the tester and bot conversation. You select the bot model route, but Prodloop-owned backend credentials are used. No bot credentials are sent from your code.
|
|
105
203
|
- `user_orchestrated`: Prodloop backend runs the tester and grader. Your SDK process runs the bot locally with your own credentials and sends only bot replies/latency back to Prodloop.
|
|
106
204
|
|
|
205
|
+
The production backend also supports `audit_discovery` for deeper prompt-risk discovery. Runnable examples are available in `examples/audit_discovery_demo.py` and `simulation_demo/prod_testing/after_pypi/audit_discovery_demo.py`.
|
|
206
|
+
|
|
107
207
|
Simulation currently accepts exactly one parameter per request. To test multiple parameters, start one simulation per parameter. `max_turns` is configurable from `1` to `10`.
|
|
108
208
|
|
|
109
209
|
Discover currently enabled simulation parameters at runtime:
|
|
@@ -194,6 +294,10 @@ For `user_orchestrated`, configure bot credentials locally for either Vertex AI
|
|
|
194
294
|
|
|
195
295
|
For adaptive simulations, `max_turns` controls turns per conversation and `adaptive_max_conversations` controls the maximum number of conversations to explore.
|
|
196
296
|
|
|
297
|
+
### Audit Discovery
|
|
298
|
+
|
|
299
|
+
Audit discovery plans targeted risk scenarios for one selected parameter, runs them against the bot, and returns passed/failed scenarios plus patch guidance for failures. A production smoke test for `section_sequencing` completed successfully with `status="completed"`, `final_result.overall_pass=true`, and `final_result.stop_reason="audit_discovery_completed"`.
|
|
300
|
+
|
|
197
301
|
### Result Shape
|
|
198
302
|
|
|
199
303
|
Simulation responses include:
|
|
@@ -4,6 +4,30 @@
|
|
|
4
4
|
|
|
5
5
|
::: prodloop.client.ProdloopClient
|
|
6
6
|
|
|
7
|
+
### `ProdloopClient.evaluate_call(...)`
|
|
8
|
+
|
|
9
|
+
Uploads a call recording for post-call evaluation.
|
|
10
|
+
|
|
11
|
+
Important arguments:
|
|
12
|
+
|
|
13
|
+
- `audio_file_path`: local audio file to evaluate.
|
|
14
|
+
- `parameters`: one or more `EvaluationParameter` values.
|
|
15
|
+
- `thresholds`: optional thresholds for deterministic timing metrics.
|
|
16
|
+
- `extraction_schema`: required when requesting `extraction_variables`.
|
|
17
|
+
- `bot_captured_variables`: required when requesting `extraction_variables`.
|
|
18
|
+
- `input_prompt`: required for `hallucination` and prompt-aware checks.
|
|
19
|
+
|
|
20
|
+
Prompt-aware checks compare the call against `input_prompt` and return a compact object:
|
|
21
|
+
|
|
22
|
+
```json
|
|
23
|
+
{
|
|
24
|
+
"passed": "true",
|
|
25
|
+
"explanation": "..."
|
|
26
|
+
}
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
`passed` can be `"true"`, `"false"`, or `"N/A"`. `"N/A"` means the parameter was not relevant to the supplied prompt or the call did not exercise enough of that behavior to judge it.
|
|
30
|
+
|
|
7
31
|
## Models
|
|
8
32
|
|
|
9
33
|
::: prodloop.models.EvaluationParameter
|
|
@@ -44,6 +44,50 @@ print(result)
|
|
|
44
44
|
- `extraction_variables` (model extracted values)
|
|
45
45
|
- `extraction_validation` (match/mismatch summary vs `bot_captured_variables`)
|
|
46
46
|
|
|
47
|
+
## Prompt-Aware Post-Call Checks
|
|
48
|
+
|
|
49
|
+
Prompt-aware parameters grade the call against the bot prompt you pass as `input_prompt`.
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from prodloop import ProdloopClient, EvaluationParameter
|
|
53
|
+
|
|
54
|
+
client = ProdloopClient(api_key="sk_live_...")
|
|
55
|
+
|
|
56
|
+
result = client.evaluate_call(
|
|
57
|
+
audio_file_path="sample_call.mp3",
|
|
58
|
+
parameters=[
|
|
59
|
+
EvaluationParameter.SECTION_SEQUENCING,
|
|
60
|
+
EvaluationParameter.MANDATORY_FIELD_GATING,
|
|
61
|
+
EvaluationParameter.INTERNAL_JARGON_LEAKAGE,
|
|
62
|
+
],
|
|
63
|
+
input_prompt="The production prompt used by the bot during this call...",
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
print(result["section_sequencing"])
|
|
67
|
+
# {"passed": "true", "explanation": "..."}
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
For prompt-aware parameters, `passed` is `"true"`, `"false"`, or `"N/A"`. The model returns `"N/A"` when the parameter is not relevant to the supplied prompt or the call does not exercise enough behavior to judge it.
|
|
71
|
+
|
|
72
|
+
This flow was tested against production with both a small subset and all prompt-aware parameters. Example response for a call that did not match the supplied bot prompt:
|
|
73
|
+
|
|
74
|
+
```json
|
|
75
|
+
{
|
|
76
|
+
"section_sequencing": {
|
|
77
|
+
"passed": "false",
|
|
78
|
+
"explanation": "The bot did not follow the section flow defined in the supplied prompt."
|
|
79
|
+
},
|
|
80
|
+
"mandatory_field_gating": {
|
|
81
|
+
"passed": "N/A",
|
|
82
|
+
"explanation": "The prompt-defined gated action was not triggered in this call."
|
|
83
|
+
},
|
|
84
|
+
"prompt_injection": {
|
|
85
|
+
"passed": "N/A",
|
|
86
|
+
"explanation": "The caller did not attempt to override instructions or inject commands."
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
```
|
|
90
|
+
|
|
47
91
|
|
|
48
92
|
## Self Simulation
|
|
49
93
|
|
|
@@ -76,6 +120,34 @@ while True:
|
|
|
76
120
|
time.sleep(2)
|
|
77
121
|
```
|
|
78
122
|
|
|
123
|
+
## Audit Discovery
|
|
124
|
+
|
|
125
|
+
Audit discovery is a production backend mode for deeper prompt-risk discovery. It plans targeted risk scenarios for one selected parameter, runs them against the bot, and returns passed/failed scenarios with patch guidance for failures.
|
|
126
|
+
|
|
127
|
+
The after-PyPI production demo lives at `simulation_demo/prod_testing/after_pypi/audit_discovery_demo.py`. A production smoke test for `section_sequencing` completed with:
|
|
128
|
+
|
|
129
|
+
```json
|
|
130
|
+
{
|
|
131
|
+
"status": "completed",
|
|
132
|
+
"final_result": {
|
|
133
|
+
"overall_pass": true,
|
|
134
|
+
"stop_reason": "audit_discovery_completed",
|
|
135
|
+
"stop_message": "Audit discovery completed across planned risk scenarios.",
|
|
136
|
+
"audit_discovery": {
|
|
137
|
+
"enabled": true,
|
|
138
|
+
"passed_scenarios": [
|
|
139
|
+
{
|
|
140
|
+
"risk_id": "fatal_emergency_interruption",
|
|
141
|
+
"planned_risk_passed": true
|
|
142
|
+
}
|
|
143
|
+
],
|
|
144
|
+
"failed_scenarios": [],
|
|
145
|
+
"error_scenarios": []
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
```
|
|
150
|
+
|
|
79
151
|
## User Orchestrated Simulation
|
|
80
152
|
|
|
81
153
|
```python
|
|
@@ -141,6 +213,18 @@ The repository includes copy-pasteable examples in `examples/`. These are embedd
|
|
|
141
213
|
--8<-- "examples/demo_gpt.py"
|
|
142
214
|
```
|
|
143
215
|
|
|
216
|
+
### Prompt-Aware Post-Call Evaluation
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
--8<-- "examples/post_call_prompt_aware_demo.py"
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
### Audit Discovery
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
--8<-- "examples/audit_discovery_demo.py"
|
|
226
|
+
```
|
|
227
|
+
|
|
144
228
|
### Vertex AI User Orchestrated Simulation
|
|
145
229
|
|
|
146
230
|
```python
|
{prodloop_observability_sdk-0.1.7 → prodloop_observability_sdk-0.1.9}/docs/getting-started.md
RENAMED
|
@@ -32,6 +32,21 @@ response = client.evaluate_call(
|
|
|
32
32
|
print(response)
|
|
33
33
|
```
|
|
34
34
|
|
|
35
|
+
For prompt-aware checks, pass the bot prompt used during the call as `input_prompt`:
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
response = client.evaluate_call(
|
|
39
|
+
audio_file_path="sample_call.mp3",
|
|
40
|
+
parameters=[
|
|
41
|
+
EvaluationParameter.SECTION_SEQUENCING,
|
|
42
|
+
EvaluationParameter.INTERNAL_JARGON_LEAKAGE,
|
|
43
|
+
],
|
|
44
|
+
input_prompt="The production prompt used by the bot during this call...",
|
|
45
|
+
)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Prompt-aware results return `passed` as `"true"`, `"false"`, or `"N/A"`. `"N/A"` means the parameter was not relevant to the supplied prompt or was not exercised enough in that call.
|
|
49
|
+
|
|
35
50
|
For extraction validation use:
|
|
36
51
|
|
|
37
52
|
```python
|
|
@@ -7,6 +7,7 @@ Use the Prodloop SDK to programmatically evaluate AI voice bot calls from Python
|
|
|
7
7
|
- send call recordings for evaluation
|
|
8
8
|
- choose exactly which metrics to compute
|
|
9
9
|
- pass thresholds and extraction schema
|
|
10
|
+
- grade real calls against the bot prompt used in production
|
|
10
11
|
- receive structured JSON responses
|
|
11
12
|
- simulate prompt-only tester/bot conversations
|
|
12
13
|
- run backend-owned self simulation or local user-orchestrated simulation
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# Parameters
|
|
2
|
+
|
|
3
|
+
Supported post-call evaluation parameters are grouped into audio metrics, extraction checks, and prompt-aware checks.
|
|
4
|
+
|
|
5
|
+
Use enum constants from `EvaluationParameter`:
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from prodloop import EvaluationParameter
|
|
9
|
+
|
|
10
|
+
params = [
|
|
11
|
+
EvaluationParameter.E2E_RESPONSE_TIME,
|
|
12
|
+
EvaluationParameter.SECTION_SEQUENCING,
|
|
13
|
+
EvaluationParameter.INTERNAL_JARGON_LEAKAGE,
|
|
14
|
+
]
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Audio And Extraction Parameters
|
|
18
|
+
|
|
19
|
+
- `e2e_response_time`: average latency in milliseconds between one speech segment ending and the next one starting.
|
|
20
|
+
- `turn_by_turn_latency`: per-gap latency values as `turn_index` and `latency_ms`.
|
|
21
|
+
- `pause_profile`: deterministic pause aggregate (`pause_count`, `total_pause_time_ms`, `longest_pause_ms`).
|
|
22
|
+
- `audio_artifacts`: deterministic signal quality indicators (`clipping_ratio`, `dc_offset`, clipping/DC flags).
|
|
23
|
+
- `hallucination`: whether the bot introduced fabricated or incorrect content.
|
|
24
|
+
- `extraction_variables`: extracts requested structured fields.
|
|
25
|
+
- `interruption_behavior`: whether interruptions were handled gracefully.
|
|
26
|
+
|
|
27
|
+
The following parameters are computed deterministically today:
|
|
28
|
+
|
|
29
|
+
- `e2e_response_time`
|
|
30
|
+
- `turn_by_turn_latency`
|
|
31
|
+
- `pause_profile`
|
|
32
|
+
- `audio_artifacts`
|
|
33
|
+
|
|
34
|
+
## Prompt-Aware Parameters
|
|
35
|
+
|
|
36
|
+
Prompt-aware parameters compare the actual call against the `input_prompt` you send with the request. Pass `input_prompt` whenever you request any of these:
|
|
37
|
+
|
|
38
|
+
- `section_sequencing`
|
|
39
|
+
- `mandatory_field_gating`
|
|
40
|
+
- `interrupt_resume_precision`
|
|
41
|
+
- `closing_verbatim_delivery`
|
|
42
|
+
- `single_attempt_constraints`
|
|
43
|
+
- `info_dump_handling`
|
|
44
|
+
- `mid_flow_intent_switch`
|
|
45
|
+
- `side_talk_leakage`
|
|
46
|
+
- `ambiguous_partial_responses`
|
|
47
|
+
- `internal_jargon_leakage`
|
|
48
|
+
- `identity_extraction`
|
|
49
|
+
- `prompt_injection`
|
|
50
|
+
- `commitment_extraction`
|
|
51
|
+
- `scope_boundary_testing`
|
|
52
|
+
- `roleplay_jailbreak`
|
|
53
|
+
- `context_memory_across_turns`
|
|
54
|
+
- `hallucination_fabrication`
|
|
55
|
+
|
|
56
|
+
Each prompt-aware result has this compact shape:
|
|
57
|
+
|
|
58
|
+
```json
|
|
59
|
+
{
|
|
60
|
+
"passed": "true",
|
|
61
|
+
"explanation": "The bot followed the required order for the exercised flow."
|
|
62
|
+
}
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
`passed` is one of:
|
|
66
|
+
|
|
67
|
+
- `"true"`: the parameter was relevant and the call satisfied the prompt.
|
|
68
|
+
- `"false"`: the parameter was relevant and the call violated the prompt.
|
|
69
|
+
- `"N/A"`: the selected parameter was not relevant to the supplied prompt or the call did not exercise enough behavior to judge it.
|
|
70
|
+
|
|
71
|
+
## Prompt-Aware Parameter Purpose
|
|
72
|
+
|
|
73
|
+
- `section_sequencing`: checks whether the bot followed the order, branches, skipped steps, and terminal states required by the supplied prompt.
|
|
74
|
+
- `mandatory_field_gating`: checks whether required information or confirmations were collected before prompt-defined dependent actions.
|
|
75
|
+
- `interrupt_resume_precision`: checks whether the bot answered interruptions and resumed the exact pending step.
|
|
76
|
+
- `closing_verbatim_delivery`: checks exact required closings and terminal post-closing behavior.
|
|
77
|
+
- `single_attempt_constraints`: checks one-attempt and bounded-retry limits defined by the prompt.
|
|
78
|
+
- `info_dump_handling`: checks whether dense user-provided details were captured and reused.
|
|
79
|
+
- `mid_flow_intent_switch`: checks whether the bot handled a legitimate intent switch without losing context.
|
|
80
|
+
- `side_talk_leakage`: checks whether background or third-party speech affected the bot incorrectly.
|
|
81
|
+
- `ambiguous_partial_responses`: checks whether vague answers were clarified before routing or confirming.
|
|
82
|
+
- `internal_jargon_leakage`: checks for internal prompt, tool, system, variable, template, or process language shown to the user.
|
|
83
|
+
- `identity_extraction`: checks whether identity or contact information was captured and used according to the prompt.
|
|
84
|
+
- `prompt_injection`: checks whether user instructions overrode the supplied prompt.
|
|
85
|
+
- `commitment_extraction`: checks for unsupported guarantees, final confirmations, approvals, timelines, or binding claims.
|
|
86
|
+
- `scope_boundary_testing`: checks whether the bot stayed inside the prompt-defined scope.
|
|
87
|
+
- `roleplay_jailbreak`: checks whether the bot resisted role/persona changes that conflict with the prompt.
|
|
88
|
+
- `context_memory_across_turns`: checks whether prior context and corrections were retained across turns.
|
|
89
|
+
- `hallucination_fabrication`: checks for unsupported facts, claims, statuses, policies, capabilities, or operational statements.
|
|
90
|
+
|
|
91
|
+
## Extraction Variables
|
|
92
|
+
|
|
93
|
+
If you include `extraction_variables`, pass both:
|
|
94
|
+
|
|
95
|
+
- `extraction_schema` (what fields to extract)
|
|
96
|
+
- `bot_captured_variables` (what your bot captured for validation)
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
extraction_schema = {
|
|
100
|
+
"customer_name": "string",
|
|
101
|
+
"budget_mentioned": "int",
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
bot_captured_variables = {
|
|
105
|
+
"customer_name": "ram",
|
|
106
|
+
"budget_mentioned": 12000,
|
|
107
|
+
}
|
|
108
|
+
```
|
|
@@ -6,6 +6,18 @@
|
|
|
6
6
|
# Bot provider credentials for self_simulation are configured on the Prodloop backend.
|
|
7
7
|
PRODLOOP_API_KEY=
|
|
8
8
|
|
|
9
|
+
# Post-call prompt-aware demo:
|
|
10
|
+
# Used by post_call_prompt_aware_demo.py.
|
|
11
|
+
POST_CALL_AUDIO_FILE=sample_call.mp3
|
|
12
|
+
POST_CALL_PROMPT_FILE=sample_prompt.txt
|
|
13
|
+
|
|
14
|
+
# Audit discovery demo:
|
|
15
|
+
# Used by audit_discovery_demo.py.
|
|
16
|
+
AUDIT_DISCOVERY_PARAMETER=section_sequencing
|
|
17
|
+
AUDIT_DISCOVERY_BOT_MODEL=azure/<deployment-name>
|
|
18
|
+
AUDIT_DISCOVERY_MAX_SCENARIOS=1
|
|
19
|
+
AUDIT_DISCOVERY_MAX_TURNS=6
|
|
20
|
+
|
|
9
21
|
# Azure OpenAI: fill this section when running GPT/Azure demos:
|
|
10
22
|
# - demo_gpt.py
|
|
11
23
|
# - user_orchestrated_demo_gpt.py
|