loopllm 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
loopllm/elicitation.py ADDED
@@ -0,0 +1,519 @@
1
+ """Bayesian intent elicitation layer.
2
+
3
+ Decomposes vague user prompts into structured specs through
4
+ information-gain-ranked clarifying questions. Learns which
5
+ questions are most valuable per task type over time.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import uuid
11
+ from dataclasses import dataclass, field
12
+ from typing import Any
13
+
14
+ import structlog
15
+
16
+ from loopllm.priors import AdaptivePriors, BetaPrior
17
+ from loopllm.provider import LLMProvider, LLMResponse
18
+
19
+ logger = structlog.get_logger(__name__)
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Question taxonomy — each type carries its own prior
23
+ # ---------------------------------------------------------------------------
24
+
25
# Categories a clarifying question may belong to. The order here is the order
# in which the categories are listed to the model in the analysis meta-prompt.
QUESTION_TYPES: list[str] = [
    "scope",        # which ground the output has to cover
    "format",       # shape / structure of the desired output
    "constraints",  # hard rules, requirements and boundaries
    "examples",     # a sample of what the user has in mind
    "edge_cases",   # treatment of corner cases
    "audience",     # who will read or consume the result
    "priority",     # what wins when trade-offs are unavoidable
]
34
+
35
# Starting (alpha, beta) parameters for the effectiveness prior of every
# question type. The values encode a weak initial belief: "scope", "format",
# "constraints" and "examples" start with alpha > beta, "priority" starts
# balanced, and "edge_cases" / "audience" start with alpha < beta.
# NOTE(review): reading alpha > beta as "usually helpful" assumes BetaPrior
# uses the standard Beta mean alpha / (alpha + beta) — confirm in priors.py.
_DEFAULT_QUESTION_PRIORS: dict[str, BetaPrior] = {
    question_type: BetaPrior(alpha=alpha, beta=beta)
    for question_type, alpha, beta in (
        ("scope", 3.0, 1.5),
        ("format", 2.5, 1.5),
        ("constraints", 2.0, 1.5),
        ("examples", 2.5, 2.0),
        ("edge_cases", 1.5, 2.0),
        ("audience", 2.0, 2.5),
        ("priority", 2.0, 2.0),
    )
}
46
+
47
+
48
+ # ---------------------------------------------------------------------------
49
+ # Data classes
50
+ # ---------------------------------------------------------------------------
51
+
52
+
53
@dataclass
class ClarifyingQuestion:
    """One candidate question for the user, together with its ranking score.

    Attributes:
        text: Wording of the question as it is shown to the user.
        question_type: Category name, taken from :data:`QUESTION_TYPES`.
        options: Suggested multiple-choice answers, or ``None`` when the
            question is free-form.
        information_gain: Expected information gain; questions with a larger
            value are asked earlier.
        prior: Beta prior that tracks how effective this question type has
            been.
    """

    text: str
    question_type: str
    options: list[str] | None = None
    information_gain: float = 0.0
    prior: BetaPrior = field(default_factory=BetaPrior)
70
+
71
+
72
@dataclass
class IntentSpec:
    """Unambiguous task description that results from an elicitation run.

    The spec is the hand-off format for later stages (the module docstring
    names the refinement loop and the task orchestrator as consumers).

    Attributes:
        task_type: Task category label, e.g. ``"code_generation"``.
        original_prompt: The prompt exactly as the user wrote it.
        refined_prompt: Expanded and clarified version of the prompt.
        constraints: Key/value constraints distilled from the user's answers.
        quality_criteria: Criteria by which the eventual output is judged.
        decomposition_hints: Proposed subtasks; empty when none are suggested.
        model_preference: Preferred model name, if the user has one.
        estimated_complexity: Difficulty estimate between 0.0 and 1.0.
        context: Any further context collected during elicitation.
    """

    task_type: str = "general"
    original_prompt: str = ""
    refined_prompt: str = ""
    constraints: dict[str, Any] = field(default_factory=dict)
    quality_criteria: list[str] = field(default_factory=list)
    decomposition_hints: list[str] = field(default_factory=list)
    model_preference: str | None = None
    estimated_complexity: float = 0.5
    context: dict[str, Any] = field(default_factory=dict)
100
+
101
+
102
@dataclass
class ElicitationSession:
    """Mutable record of one elicitation conversation.

    Attributes:
        session_id: Identifier for the session; defaults to the first 12 hex
            characters of a random UUID4.
        original_prompt: The user's input, verbatim.
        questions_asked: Questions posed so far, in the order they were asked.
        answers: User answers keyed by question type.
        refined_spec: Final spec, filled in once :meth:`IntentRefiner.refine`
            has run; ``None`` until then.
        task_type: Task type detected for the prompt.
        model_id: Identifier of the model in use.
    """

    session_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    original_prompt: str = ""
    questions_asked: list[ClarifyingQuestion] = field(default_factory=list)
    answers: dict[str, str] = field(default_factory=dict)
    refined_spec: IntentSpec | None = None
    task_type: str = "general"
    model_id: str = ""
123
+
124
+
125
+ # ---------------------------------------------------------------------------
126
+ # Prompt templates
127
+ # ---------------------------------------------------------------------------
128
+
129
# Meta-prompt sent by ``IntentRefiner.analyze``. It is filled in with
# ``str.format``: ``{question_types}`` receives the comma-separated
# ``QUESTION_TYPES`` and ``{prompt}`` the user's text. The model is asked to
# answer with a JSON array of question objects, which
# ``IntentRefiner._parse_questions`` decodes.
_ANALYZE_PROMPT = """\
You are an intent-analysis assistant. Given the user's prompt below,
identify what is AMBIGUOUS, MISSING, or ASSUMED.

For each gap you find, produce a JSON object with:
- "question_type": one of {question_types}
- "question": a concise clarifying question to ask the user
- "options": optional list of 2-4 suggested answers (or null)

Return a JSON array of these objects. Return at most 5 questions.
Order them by importance (most important first).

User prompt:
\"\"\"
{prompt}
\"\"\"

Respond ONLY with the JSON array, no other text.
"""
148
+
149
# Meta-prompt sent by ``IntentRefiner.refine``. It is filled in with
# ``str.format``: ``{prompt}`` receives the user's original prompt and
# ``{qa_text}`` the newline-joined answers gathered so far. The model is asked
# to answer with a single JSON object, which ``IntentRefiner._parse_spec``
# turns into an ``IntentSpec``.
_REFINE_PROMPT = """\
You are a prompt-engineering assistant. Given the original user prompt
and their answers to clarifying questions, produce a structured
specification.

Original prompt:
\"\"\"
{prompt}
\"\"\"

Clarifying Q&A:
{qa_text}

Produce a JSON object with these fields:
- "task_type": a short category label (e.g. "code_generation", "summarization", "data_extraction")
- "refined_prompt": an improved, unambiguous version of the user's prompt that incorporates all their answers
- "constraints": an object of key-value constraints extracted from answers
- "quality_criteria": a list of 2-5 criteria for judging output quality
- "decomposition_hints": a list of subtask descriptions if the task should be broken down (empty list if atomic)
- "estimated_complexity": a float 0.0-1.0 (0=trivial, 1=very complex)

Respond ONLY with the JSON object, no other text.
"""
172
+
173
# Meta-prompt sent by ``IntentRefiner.classify_task``. ``{prompt}`` is replaced
# with the user's prompt via ``str.format``. The labels listed in the text are
# the same ones ``classify_task`` accepts; any other reply is mapped to
# "general" there.
_CLASSIFY_PROMPT = """\
Classify the following user prompt into exactly one task type.
Choose from: code_generation, summarization, data_extraction, question_answering,
creative_writing, analysis, transformation, general.

User prompt: "{prompt}"

Respond with ONLY the task type label, nothing else.
"""
182
+
183
+
184
+ # ---------------------------------------------------------------------------
185
+ # IntentRefiner
186
+ # ---------------------------------------------------------------------------
187
+
188
+
189
class IntentRefiner:
    """Bayesian elicitation engine that turns vague prompts into structured specs.

    An LLM proposes clarifying questions; each question is scored with the
    Beta prior kept for its question type, and questioning stops once the
    best remaining score drops below ``min_info_gain`` or ``max_questions``
    questions have been asked. The gathered answers are then synthesized
    into an :class:`IntentSpec`. Per-type priors are updated from task
    outcomes via :meth:`observe_outcome`.

    Args:
        provider: LLM provider used for every meta-prompt.
        priors: Adaptive priors object. A fresh :class:`AdaptivePriors` is
            created only when ``None`` is passed.
        model: Model name handed to ``provider.complete``.
        min_info_gain: Stop asking when the best remaining question has an
            information gain below this value.
        max_questions: Hard cap on questions per session.
    """

    # Labels offered to the model by the classification meta-prompt; any
    # other reply is reported as "general".
    _KNOWN_TASK_TYPES = frozenset(
        {
            "code_generation",
            "summarization",
            "data_extraction",
            "question_answering",
            "creative_writing",
            "analysis",
            "transformation",
            "general",
        }
    )
    # Used when the model omits the complexity or returns something unusable.
    _DEFAULT_COMPLEXITY = 0.5

    def __init__(
        self,
        provider: LLMProvider,
        priors: AdaptivePriors | None = None,
        model: str = "gpt-4o-mini",
        min_info_gain: float = 0.15,
        max_questions: int = 5,
    ) -> None:
        self.provider = provider
        # Compare with None explicitly: a caller-supplied priors object must
        # be kept even if it happens to evaluate as falsy (e.g. it is empty).
        self.priors = priors if priors is not None else AdaptivePriors()
        self.model = model
        self.min_info_gain = min_info_gain
        self.max_questions = max_questions
        # Copy the defaults so that learning in one refiner never mutates the
        # module-level table shared by all instances.
        self._question_priors: dict[str, BetaPrior] = {
            qt: BetaPrior(alpha=p.alpha, beta=p.beta)
            for qt, p in _DEFAULT_QUESTION_PRIORS.items()
        }

    # -- question prior management -------------------------------------------

    def _get_question_prior(self, question_type: str) -> BetaPrior:
        """Return the effectiveness prior for *question_type*.

        Unknown types get a new ``BetaPrior(alpha=1.5, beta=1.5)`` that is
        stored for subsequent calls.
        """
        if question_type not in self._question_priors:
            self._question_priors[question_type] = BetaPrior(alpha=1.5, beta=1.5)
        return self._question_priors[question_type]

    def _compute_info_gain(self, question_type: str) -> float:
        """Compute the expected information gain for a question type.

        The score is ``prior.mean * (1 - prior.confidence)``: it grows with
        the prior's mean and shrinks as the prior's confidence rises.
        """
        prior = self._get_question_prior(question_type)
        return prior.mean * (1.0 - prior.confidence)

    # -- normalisation helpers -----------------------------------------------

    @classmethod
    def _normalize_task_type(cls, label: Any) -> str:
        """Map a raw model reply onto a known task type, else ``"general"``.

        Besides lower-casing and removing quotes, this tolerates backticks,
        surrounding punctuation, and spaces or hyphens used in place of
        underscores (``"Code Generation."`` becomes ``"code_generation"``).
        Known labels contain only letters and underscores, so a reply that was
        already valid is never altered by the extra steps.
        """
        if not isinstance(label, str):
            return "general"
        cleaned = label.strip().lower().replace('"', "").replace("'", "")
        cleaned = cleaned.strip(" `.!,:;")
        cleaned = cleaned.replace(" ", "_").replace("-", "_")
        return cleaned if cleaned in cls._KNOWN_TASK_TYPES else "general"

    @staticmethod
    def _coerce_complexity(value: Any) -> float:
        """Turn a model-supplied complexity into a float within ``[0.0, 1.0]``.

        Anything that cannot be read as a number (``None``, lists, free text)
        and NaN yield the default of 0.5 instead of raising.
        """
        try:
            number = float(value)
        except (TypeError, ValueError):
            return IntentRefiner._DEFAULT_COMPLEXITY
        if number != number:  # NaN is the only value unequal to itself
            return IntentRefiner._DEFAULT_COMPLEXITY
        return max(0.0, min(1.0, number))

    @staticmethod
    def _coerce_str_list(value: Any) -> list[str]:
        """Return *value* as a list of strings.

        A list is converted item by item (``None`` items are dropped), a
        non-blank string is wrapped in a one-element list, and every other
        value yields an empty list.
        """
        if isinstance(value, str):
            return [value] if value.strip() else []
        if isinstance(value, list):
            return [str(item) for item in value if item is not None]
        return []

    # -- core API ------------------------------------------------------------

    def classify_task(self, prompt: str) -> str:
        """Classify a prompt into a task type using the LLM.

        Args:
            prompt: The user's original prompt.

        Returns:
            One of the known task type labels; ``"general"`` when the model's
            reply does not match any of them.
        """
        classify_prompt = _CLASSIFY_PROMPT.format(prompt=prompt)
        response: LLMResponse = self.provider.complete(classify_prompt, self.model)
        return self._normalize_task_type(response.content)

    def analyze(self, prompt: str) -> list[ClarifyingQuestion]:
        """Generate clarifying questions ranked by expected information gain.

        Sends the prompt to the LLM inside a meta-prompt that asks for
        ambiguities, parses the reply, and scores every question with the
        prior of its question type.

        Args:
            prompt: The user's original prompt.

        Returns:
            List of :class:`ClarifyingQuestion` objects, sorted by
            information gain (highest first). Empty when the reply could not
            be parsed.
        """
        analyze_prompt = _ANALYZE_PROMPT.format(
            prompt=prompt, question_types=", ".join(QUESTION_TYPES)
        )
        response: LLMResponse = self.provider.complete(analyze_prompt, self.model)

        # ``or ""`` keeps a missing/None content from crashing the parser.
        questions = self._parse_questions(response.content or "")

        for q in questions:
            q.prior = self._get_question_prior(q.question_type)
            q.information_gain = self._compute_info_gain(q.question_type)

        questions.sort(key=lambda q: q.information_gain, reverse=True)
        return questions

    def ask(self, session: ElicitationSession) -> ClarifyingQuestion | None:
        """Return the next best question to ask, or ``None`` if enough info gathered.

        Stops when ``max_questions`` has been reached, when no question of a
        not-yet-asked type remains, or when the best remaining question's
        information gain is below ``min_info_gain``. The caller is
        responsible for appending the returned question to
        ``session.questions_asked``.

        Args:
            session: The current elicitation session.

        Returns:
            The next question to ask, or ``None`` to stop.
        """
        if len(session.questions_asked) >= self.max_questions:
            return None

        if not session.questions_asked:
            # First call: analyze the bare prompt.
            candidates = self.analyze(session.original_prompt)
        else:
            # Later calls: re-analyze with the Q&A so far as extra context.
            qa_context = "\n".join(
                f"Q ({q.question_type}): {q.text}\n"
                f"A: {session.answers.get(q.question_type, '(unanswered)')}"
                for q in session.questions_asked
            )
            enriched = (
                f"{session.original_prompt}\n\n"
                f"Already answered:\n{qa_context}\n\n"
                f"What else is still unclear or ambiguous?"
            )
            candidates = self.analyze(enriched)

        # Each question type is asked at most once per session.
        asked_types = {q.question_type for q in session.questions_asked}
        candidates = [q for q in candidates if q.question_type not in asked_types]
        if not candidates:
            return None

        # ``analyze`` sorted by gain, so the first survivor is the best one.
        best = candidates[0]
        if best.information_gain < self.min_info_gain:
            return None
        return best

    def refine(
        self, prompt: str, answers: dict[str, str]
    ) -> IntentSpec:
        """Synthesize a structured :class:`IntentSpec` from prompt + answers.

        Args:
            prompt: The user's original prompt.
            answers: Dict mapping question_type → user answer.

        Returns:
            A structured :class:`IntentSpec`. When the model's reply is not
            valid JSON, the reply text itself becomes the refined prompt.
        """
        # Only the answers are available here, so label them as answers; the
        # previous "Q (...)" label presented each answer as if it were a
        # question.
        qa_text = "\n".join(
            f"A ({qt}): {ans}" for qt, ans in answers.items()
        )
        refine_prompt = _REFINE_PROMPT.format(prompt=prompt, qa_text=qa_text)

        response: LLMResponse = self.provider.complete(refine_prompt, self.model)
        return self._parse_spec(response.content or "", prompt)

    def run_session(
        self,
        prompt: str,
        answer_func: Any | None = None,
    ) -> ElicitationSession:
        """Run a complete elicitation session programmatically.

        If *answer_func* is ``None``, questions are collected without answers
        (useful for obtaining the question list). Otherwise
        ``answer_func(question)`` is called for every question and its result
        stored under the question's type.

        Args:
            prompt: The user's original prompt.
            answer_func: Optional callable ``(ClarifyingQuestion) -> str``.

        Returns:
            The completed :class:`ElicitationSession`, with ``refined_spec``
            always populated.
        """
        session = ElicitationSession(original_prompt=prompt)
        session.task_type = self.classify_task(prompt)

        while True:
            question = self.ask(session)
            if question is None:
                break

            session.questions_asked.append(question)

            if answer_func is not None:
                session.answers[question.question_type] = answer_func(question)

        if session.answers:
            session.refined_spec = self.refine(prompt, session.answers)
        else:
            # Nothing was answered: fall back to a minimal spec that simply
            # echoes the prompt.
            session.refined_spec = IntentSpec(
                task_type=session.task_type,
                original_prompt=prompt,
                refined_prompt=prompt,
            )

        return session

    def observe_outcome(
        self,
        session: ElicitationSession,
        final_score: float,
    ) -> None:
        """Update question priors based on the final outcome score.

        Every question asked in the session receives the same update: a
        positive one when ``final_score`` is at least 0.7, a negative one
        otherwise.

        Args:
            session: The completed elicitation session.
            final_score: Final quality score of the task output.
        """
        success = final_score >= 0.7
        for q in session.questions_asked:
            prior = self._get_question_prior(q.question_type)
            prior.update(success)
            logger.debug(
                "question_prior_updated",
                question_type=q.question_type,
                success=success,
                new_mean=prior.mean,
            )

    # -- parsing helpers -----------------------------------------------------

    def _parse_questions(self, raw: str) -> list[ClarifyingQuestion]:
        """Parse an LLM reply into :class:`ClarifyingQuestion` objects.

        The reply should be a JSON array; when extra text surrounds it, the
        span from the first ``[`` to the last ``]`` is tried. Entries that are
        not objects or carry no question text are skipped. Question types are
        matched case-insensitively and unknown ones fall back to ``"scope"``.
        Returns an empty list (and logs a warning) when no array can be
        decoded.
        """
        raw = raw.strip()
        if not raw.startswith("["):
            start = raw.find("[")
            end = raw.rfind("]")
            if start < 0 or end <= start:
                logger.warning("failed_to_parse_questions", raw=raw[:200])
                return []
            raw = raw[start : end + 1]

        try:
            items = json.loads(raw)
        except json.JSONDecodeError:
            logger.warning("json_parse_failed", raw=raw[:200])
            return []

        questions: list[ClarifyingQuestion] = []
        for item in items:
            if not isinstance(item, dict):
                continue

            # A question without text cannot be shown to the user.
            text = item.get("question")
            if not isinstance(text, str) or not text.strip():
                continue

            raw_type = item.get("question_type")
            qt = raw_type.strip().lower() if isinstance(raw_type, str) else ""
            if qt not in QUESTION_TYPES:
                qt = "scope"  # Default to scope for unknown types

            # Keep options only when they really are a list.
            raw_options = item.get("options")
            options = (
                [str(opt) for opt in raw_options if opt is not None]
                if isinstance(raw_options, list)
                else None
            )

            questions.append(
                ClarifyingQuestion(text=text, question_type=qt, options=options)
            )
        return questions

    def _parse_spec(self, raw: str, original_prompt: str) -> IntentSpec:
        """Parse an LLM reply into an :class:`IntentSpec`.

        The reply should be a JSON object; when extra text surrounds it, the
        span from the first ``{`` to the last ``}`` is tried. If no object can
        be decoded, the text becomes the refined prompt of an otherwise
        default spec. Fields with a missing, ``null`` or wrongly typed value
        fall back to their defaults instead of raising or leaking into the
        spec.
        """
        raw = raw.strip()
        if not raw.startswith("{"):
            start = raw.find("{")
            end = raw.rfind("}")
            if start < 0 or end <= start:
                # Fallback: treat entire response as refined prompt
                return IntentSpec(
                    original_prompt=original_prompt,
                    refined_prompt=raw or original_prompt,
                )
            raw = raw[start : end + 1]

        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            return IntentSpec(
                original_prompt=original_prompt,
                refined_prompt=raw or original_prompt,
            )

        task_type = data.get("task_type")
        if not isinstance(task_type, str) or not task_type.strip():
            task_type = "general"

        refined_prompt = data.get("refined_prompt")
        if not isinstance(refined_prompt, str) or not refined_prompt.strip():
            refined_prompt = original_prompt

        constraints = data.get("constraints")
        if not isinstance(constraints, dict):
            constraints = {}

        return IntentSpec(
            task_type=task_type,
            original_prompt=original_prompt,
            refined_prompt=refined_prompt,
            constraints=constraints,
            quality_criteria=self._coerce_str_list(data.get("quality_criteria")),
            decomposition_hints=self._coerce_str_list(
                data.get("decomposition_hints")
            ),
            estimated_complexity=self._coerce_complexity(
                data.get("estimated_complexity", self._DEFAULT_COMPLEXITY)
            ),
        )