mirage-benchmark 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mirage-benchmark might be problematic.
- mirage/__init__.py +83 -0
- mirage/cli.py +150 -0
- mirage/core/__init__.py +52 -0
- mirage/core/config.py +248 -0
- mirage/core/llm.py +1745 -0
- mirage/core/prompts.py +884 -0
- mirage/embeddings/__init__.py +31 -0
- mirage/embeddings/models.py +512 -0
- mirage/embeddings/rerankers_multimodal.py +766 -0
- mirage/embeddings/rerankers_text.py +149 -0
- mirage/evaluation/__init__.py +26 -0
- mirage/evaluation/metrics.py +2223 -0
- mirage/evaluation/metrics_optimized.py +2172 -0
- mirage/pipeline/__init__.py +45 -0
- mirage/pipeline/chunker.py +545 -0
- mirage/pipeline/context.py +1003 -0
- mirage/pipeline/deduplication.py +491 -0
- mirage/pipeline/domain.py +514 -0
- mirage/pipeline/pdf_processor.py +598 -0
- mirage/pipeline/qa_generator.py +798 -0
- mirage/utils/__init__.py +31 -0
- mirage/utils/ablation.py +360 -0
- mirage/utils/preflight.py +663 -0
- mirage/utils/stats.py +626 -0
- mirage_benchmark-1.0.4.dist-info/METADATA +490 -0
- mirage_benchmark-1.0.4.dist-info/RECORD +30 -0
- mirage_benchmark-1.0.4.dist-info/WHEEL +5 -0
- mirage_benchmark-1.0.4.dist-info/entry_points.txt +3 -0
- mirage_benchmark-1.0.4.dist-info/licenses/LICENSE +190 -0
- mirage_benchmark-1.0.4.dist-info/top_level.txt +1 -0
mirage/core/prompts.py
ADDED
@@ -0,0 +1,884 @@
"""
Multimodal RAG Evaluation Dataset Generation Prompts
=====================================================

A collection of prompts for generating high-quality Question-Answer datasets
from multimodal technical documents. Designed for RAG system evaluation with
support for text, tables, figures, and images.

This module provides prompts for:
- Document semantic chunking
- Chunk completeness verification
- QA pair generation and verification
- Deduplication and merging
- Retrieval metrics evaluation (faithfulness, precision, recall)
- Multimodal content handling

Usage:
    from mirage.core.prompts import PROMPTS, PROMPTS_DESC, PROMPTS_CHUNK, PROMPTS_METRICS

Author: [Your Name/Organization]
License: Apache 2.0
"""

from __future__ import annotations
from typing import Any

# =============================================================================
# CONFIGURATION
# =============================================================================

PROMPTS: dict[str, Any] = {}
PROMPTS_DESC: dict[str, Any] = {}
PROMPTS_CHUNK: dict[str, Any] = {}
PROMPTS_METRICS: dict[str, Any] = {}
PROMPTS_METRICS_OPT: dict[str, Any] = {}

# Delimiters for structured output parsing
PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|#|>"
PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|#|>END<|#|>"


# =============================================================================
# IMAGE/TABLE DESCRIPTION PROMPTS
# =============================================================================

PROMPTS_DESC["image"] = """
Generate a technical summary of the provided figure in a SINGLE PARAGRAPH (< 250 words).

Guidelines:
- Focus on technical data, relationships, and engineering principles
- Only describe visual attributes if they encode technical information
- Exclude non-technical content (watermarks, page numbers, decorative elements)

Structure (as continuous paragraph):
1. Figure type and engineering objective (one sentence)
2. Technical analysis:
   - Plots/Charts: axes, units, variables, trends, operating regions
   - Diagrams: components, connections, flow direction, system architecture
3. Key engineering insights and practical implications

[EXAMPLE_PLACEHOLDER: Provide domain-specific figure and expected response]
"""

PROMPTS_DESC["table"] = """
Generate a technical summary of the provided table in a SINGLE PARAGRAPH (< 250 words).

Guidelines:
- Focus on data, specifications, and technical limits
- Do not reproduce or enumerate the table data
- Exclude generic headers/footers and document metadata

Structure (as continuous paragraph):
1. Table's engineering function (one sentence, like a caption)
2. Column/row organization and data ranges
3. Primary engineering application or conclusion

[EXAMPLE_PLACEHOLDER: Provide domain-specific table and expected response]
"""


# =============================================================================
# SEMANTIC CHUNKING PROMPTS
# =============================================================================

PROMPTS_CHUNK["semantic_chunking"] = """
|
|
86
|
+
You are a document parser for semantic chunking. Segment the markdown into coherent chunks.
|
|
87
|
+
|
|
88
|
+
## Processing Rules
|
|
89
|
+
|
|
90
|
+
1. **Exclusions**: Ignore Table of Contents, List of Figures, List of Tables
|
|
91
|
+
2. **Artifact Priority**: Identify figures, tables, standalone images first
|
|
92
|
+
3. **Semantic Cohesion**: Each chunk = single self-contained topic
|
|
93
|
+
4. **Avoid Fragmentation**: Prefer paragraph-level over sentence-level granularity
|
|
94
|
+
5. **Content Integrity**: Preserve exact verbatim markdown
|
|
95
|
+
|
|
96
|
+
## Chunk Types
|
|
97
|
+
|
|
98
|
+
| Type | Description |
|
|
99
|
+
|------|-------------|
|
|
100
|
+
| `text` | Textual sections/subsections with title and content |
|
|
101
|
+
| `table` | Table with caption, data, and footnotes |
|
|
102
|
+
| `table with images` | Table containing embedded images |
|
|
103
|
+
| `figure` | Image with caption and description |
|
|
104
|
+
| `standalone image` | Image not associated with a figure caption |
|
|
105
|
+
|
|
106
|
+
## Output Format
|
|
107
|
+
|
|
108
|
+
```
|
|
109
|
+
<chunk_id>[N]<|#|><chunk_type>[type]<|#|><content>[verbatim markdown]<|#|><artifact>[image path(s) or None]<|#|><status>[COMPLETE|INCOMPLETE]<|#|><chunk_end>
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Field Definitions
|
|
113
|
+
|
|
114
|
+
- `chunk_id`: Sequential number (1, 2, 3...)
|
|
115
|
+
- `chunk_type`: One of: text, table, table with images, figure, standalone image
|
|
116
|
+
- `content`: Exact unmodified markdown
|
|
117
|
+
- `artifact`: Image path(s) from `` syntax, or `None`
|
|
118
|
+
- `status`: COMPLETE if self-contained, INCOMPLETE if cut off
|
|
119
|
+
|
|
120
|
+
[EXAMPLE_PLACEHOLDER: Provide sample document and expected chunked output]
|
|
121
|
+
"""
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
PROMPTS_CHUNK["completion_verification"] = """
|
|
125
|
+
You are a Chunk Completion Verification Agent. Evaluate if the chunk is semantically COMPLETE or INCOMPLETE.
|
|
126
|
+
|
|
127
|
+
Domain: {domain}
|
|
128
|
+
Expert Role: {expert_persona}
|
|
129
|
+
|
|
130
|
+
## Incompleteness Indicators
|
|
131
|
+
|
|
132
|
+
1. **Missing References**: "Figure X", "Table Y", "see Section Z" without actual content
|
|
133
|
+
2. **Undefined Terms**: Acronyms, classifications, or procedures used without definition
|
|
134
|
+
3. **Implicit Context**: "as mentioned earlier", "the previous method", "this configuration"
|
|
135
|
+
4. **Missing Artifacts**: Text describing a figure/table without the actual visual
|
|
136
|
+
|
|
137
|
+
## Exceptions (Mark COMPLETE)
|
|
138
|
+
|
|
139
|
+
- Standalone images with visible content
|
|
140
|
+
- Normative reference sections
|
|
141
|
+
- Universal abbreviations (kW, Hz, °C)
|
|
142
|
+
|
|
143
|
+
## Output Format
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
Status: COMPLETE|INCOMPLETE
|
|
147
|
+
Query: None|<search_query_1> | <search_query_2> | ...
|
|
148
|
+
Explanation: <brief explanation>
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Search queries must be specific enough to retrieve the missing content.
|
|
152
|
+
|
|
153
|
+
[EXAMPLE_PLACEHOLDER: Provide complete and incomplete chunk examples]
|
|
154
|
+
"""
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
PROMPTS_CHUNK["chunk_addition_verification"] = """
|
|
158
|
+
You are a Chunk Addition Verification Agent. Classify if a CANDIDATE chunk should be added to build context.
|
|
159
|
+
|
|
160
|
+
Context: The ORIGINAL chunk is incomplete. A search found the CANDIDATE chunk.
|
|
161
|
+
|
|
162
|
+
Expert Role: {expert_persona}
|
|
163
|
+
Domain: {domain}
|
|
164
|
+
|
|
165
|
+
## Classification
|
|
166
|
+
|
|
167
|
+
**EXPLANATORY**: Directly resolves incompleteness
|
|
168
|
+
- Provides missing artifact (figure, table, formula)
|
|
169
|
+
- Defines undefined acronym/term
|
|
170
|
+
- Supplies referenced prior context
|
|
171
|
+
|
|
172
|
+
**RELATED**: Useful but doesn't directly resolve
|
|
173
|
+
- Same domain/topic, complementary information
|
|
174
|
+
- General theory or background
|
|
175
|
+
- Could enhance multi-hop QA generation
|
|
176
|
+
|
|
177
|
+
**UNRELATED**: No contribution
|
|
178
|
+
- Different domain with no connection
|
|
179
|
+
- No semantic overlap
|
|
180
|
+
|
|
181
|
+
## Output Format
|
|
182
|
+
|
|
183
|
+
```
|
|
184
|
+
Status: EXPLANATORY|RELATED|UNRELATED
|
|
185
|
+
Explanation: <brief justification>
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
[EXAMPLE_PLACEHOLDER: Provide original chunk, search query, and candidate chunk examples]
|
|
189
|
+
"""
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
PROMPTS_CHUNK["relevance_check"] = """
|
|
193
|
+
Evaluate if the chunk is relevant to the specified expert role and domain.
|
|
194
|
+
|
|
195
|
+
Expert Role: {expert_persona}
|
|
196
|
+
Domain: {domain}
|
|
197
|
+
|
|
198
|
+
Chunk Content:
|
|
199
|
+
{content}
|
|
200
|
+
|
|
201
|
+
## RELEVANT if:
|
|
202
|
+
- Contains technical information, procedures, specifications useful for the role
|
|
203
|
+
- Includes figures, diagrams, charts conveying technical data
|
|
204
|
+
- Addresses topics within the domain expertise
|
|
205
|
+
|
|
206
|
+
## NOT_RELEVANT if:
|
|
207
|
+
- Only document metadata (titles, page numbers, copyright)
|
|
208
|
+
- Purely decorative content (logos, backgrounds)
|
|
209
|
+
- Completely unrelated to the domain
|
|
210
|
+
|
|
211
|
+
Respond with ONLY: "RELEVANT" or "NOT_RELEVANT"
|
|
212
|
+
"""
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# =============================================================================
|
|
216
|
+
# DOMAIN AND EXPERT EXTRACTION
|
|
217
|
+
# =============================================================================
|
|
218
|
+
|
|
219
|
+
PROMPTS["domain_and_expert_from_topics"] = """
|
|
220
|
+
Analyze the following topics extracted from a technical document collection:
|
|
221
|
+
|
|
222
|
+
{topic_list_str}
|
|
223
|
+
|
|
224
|
+
Determine:
|
|
225
|
+
1. The specific technical/professional domain
|
|
226
|
+
2. An appropriate expert role title
|
|
227
|
+
|
|
228
|
+
## Output Format
|
|
229
|
+
|
|
230
|
+
```
|
|
231
|
+
<|#|>START<|#|>
|
|
232
|
+
<|#|>Domain: <domain name>
|
|
233
|
+
<|#|>Expert Role: <expert role title>
|
|
234
|
+
<|#|>END<|#|>
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
[EXAMPLE_PLACEHOLDER: Provide sample topics and expected domain/role output]
|
|
238
|
+
"""
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
# =============================================================================
|
|
242
|
+
# QA GENERATION PROMPTS
|
|
243
|
+
# =============================================================================
|
|
244
|
+
|
|
245
|
+
PROMPTS["question_answer_generation"] = """
|
|
246
|
+
You are a(n) {expert_persona} in {domain_context}. Generate QA pairs for evaluating information retrieval systems.
|
|
247
|
+
|
|
248
|
+
Content:
|
|
249
|
+
{content}
|
|
250
|
+
|
|
251
|
+
## Critical Requirements
|
|
252
|
+
|
|
253
|
+
1. **Content-Only**: Use ONLY information present in the content. NO external knowledge.
|
|
254
|
+
2. **Minimal Coverage**: Generate minimum questions to comprehensively span content without redundancy
|
|
255
|
+
3. **Role-Appropriate**: Questions suitable for {expert_persona} in {domain_relevance}
|
|
256
|
+
4. **Self-Contained**: Questions must be standalone without implicit references
|
|
257
|
+
5. **Non-Trivial**: Require specific content to answer, not general knowledge
|
|
258
|
+
|
|
259
|
+
## Forbidden Vague References
|
|
260
|
+
|
|
261
|
+
NEVER use:
|
|
262
|
+
- "the provided X", "the described X", "the specified X"
|
|
263
|
+
- "this/that/these/those X" without explicit identification
|
|
264
|
+
- "according to the content/document/text"
|
|
265
|
+
|
|
266
|
+
INSTEAD, explicitly name standards, figures, tables, sections.
|
|
267
|
+
|
|
268
|
+
## Output Format
|
|
269
|
+
|
|
270
|
+
```
|
|
271
|
+
<|#|>START<|#|>
|
|
272
|
+
Question<|#|><explicit, self-contained question><|#|>Answer<|#|><brief answer with specific references><|#|>Relevance<|#|><0-10><|#|>Difficulty<|#|><0-10>
|
|
273
|
+
<|#|>NEXT<|#|>
|
|
274
|
+
...
|
|
275
|
+
<|#|>END<|#|>
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
## Rating Scales
|
|
279
|
+
|
|
280
|
+
- **Relevance** (0-10): Importance to domain expert (0=irrelevant, 10=critical)
|
|
281
|
+
- **Difficulty** (0-10): Technical depth required (0=trivial, 10=expert insight)
|
|
282
|
+
|
|
283
|
+
[EXAMPLE_PLACEHOLDER: Provide content samples with correct and incorrect QA generation examples]
|
|
284
|
+
"""
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
PROMPTS["question_answer_selection"] = """
|
|
288
|
+
You are a QA Selection Agent ({expert_persona}, {domain_context}). Evaluate if a QA pair should be SELECTED or REJECTED.
|
|
289
|
+
|
|
290
|
+
Content:
|
|
291
|
+
{content}
|
|
292
|
+
|
|
293
|
+
Question: {question}
|
|
294
|
+
Answer: {answer}
|
|
295
|
+
|
|
296
|
+
## REJECT if:
|
|
297
|
+
1. **Improper References**: Vague references without explicit identification
|
|
298
|
+
2. **Vague Phrases**: "the provided/described/specified X", "this/that X"
|
|
299
|
+
3. **Trivial**: Answerable with general knowledge alone
|
|
300
|
+
4. **Non-Technical**: Document metadata, structure, formatting
|
|
301
|
+
5. **Ambiguous**: Unclear or requires unstated assumptions
|
|
302
|
+
6. **Out of Scope**: Irrelevant to {domain_relevance}
|
|
303
|
+
|
|
304
|
+
## SELECT if:
|
|
305
|
+
1. Self-contained and explicit references
|
|
306
|
+
2. Requires provided content to answer
|
|
307
|
+
3. Relevant to domain and appropriate difficulty
|
|
308
|
+
4. Demonstrates good technical depth
|
|
309
|
+
|
|
310
|
+
## Output Format
|
|
311
|
+
|
|
312
|
+
```
|
|
313
|
+
Status<|#|>SELECTED|REJECTED<|#|>Relevance<|#|><0-10><|#|>Difficulty<|#|><0-10><|#|>Reason<|#|><brief explanation>
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
[EXAMPLE_PLACEHOLDER: Provide selection and rejection examples]
|
|
317
|
+
"""
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
PROMPTS["question_answer_verification"] = """
|
|
321
|
+
You are a QA Verification Agent ({expert_persona}, {domain_context}). Verify the QA pair.
|
|
322
|
+
|
|
323
|
+
Content:
|
|
324
|
+
{content}
|
|
325
|
+
|
|
326
|
+
Question: {question}
|
|
327
|
+
Answer: {answer}
|
|
328
|
+
|
|
329
|
+
## Evaluation Criteria
|
|
330
|
+
|
|
331
|
+
1. Does the question involve specific content details?
|
|
332
|
+
2. Does the answer depend on information only in this content?
|
|
333
|
+
3. Can someone answer using only general knowledge?
|
|
334
|
+
4. For tables/images: Is the answer factually supported by the data?
|
|
335
|
+
|
|
336
|
+
## Vague Reference Check
|
|
337
|
+
|
|
338
|
+
QUESTION_INCORRECT if contains:
|
|
339
|
+
- "the provided/described/specified X"
|
|
340
|
+
- "this/that/these/those X" without identification
|
|
341
|
+
- References assuming reader has access to content
|
|
342
|
+
|
|
343
|
+
## Output Format
|
|
344
|
+
|
|
345
|
+
```
|
|
346
|
+
QUESTION_CORRECT|QUESTION_INCORRECT, ANSWER_CORRECT|ANSWER_INCORRECT, REQUIRES_CONTENT|CAN_ANSWER_WITHOUT_CONTENT
|
|
347
|
+
Justification: <brief explanation>
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
[EXAMPLE_PLACEHOLDER: Provide verification examples]
|
|
351
|
+
"""
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
PROMPTS["question_answer_generation_corrected"] = """
|
|
355
|
+
You are a(n) {expert_persona} in {domain_context}. Correct a failed QA pair.
|
|
356
|
+
|
|
357
|
+
Content:
|
|
358
|
+
{content}
|
|
359
|
+
|
|
360
|
+
## Failed QA and Feedback:
|
|
361
|
+
{failed_qa_feedback}
|
|
362
|
+
|
|
363
|
+
## Common Fixes
|
|
364
|
+
|
|
365
|
+
1. **Vague References**: Replace with explicit identifiers (document, section, figure names)
|
|
366
|
+
2. **Factual Errors**: Verify against provided content
|
|
367
|
+
3. **Too General**: Make specific to content information
|
|
368
|
+
4. **Hallucination**: Only reference what exists in content
|
|
369
|
+
|
|
370
|
+
## Output Format
|
|
371
|
+
|
|
372
|
+
```
|
|
373
|
+
<|#|>START<|#|>
|
|
374
|
+
Question<|#|><corrected question><|#|>Answer<|#|><corrected answer><|#|>Relevance<|#|><0-10><|#|>Difficulty<|#|><0-10>
|
|
375
|
+
<|#|>END<|#|>
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
If the original topic cannot be addressed with available content, return empty:
|
|
379
|
+
```
|
|
380
|
+
<|#|>START<|#|>
|
|
381
|
+
<|#|>END<|#|>
|
|
382
|
+
```
|
|
383
|
+
"""
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
# =============================================================================
|
|
387
|
+
# DEDUPLICATION PROMPTS
|
|
388
|
+
# =============================================================================
|
|
389
|
+
|
|
390
|
+
PROMPTS["deduplication_rank"] = """
|
|
391
|
+
You are a Data Curator ({expert_persona}, {domain}). Order similar QA pairs from least to most similar.
|
|
392
|
+
|
|
393
|
+
## Task
|
|
394
|
+
Order from "most distinct/unique" to "most redundant" relative to the cluster's core topic.
|
|
395
|
+
|
|
396
|
+
Cluster Candidates:
|
|
397
|
+
{candidates_text}
|
|
398
|
+
|
|
399
|
+
## Output Format
|
|
400
|
+
|
|
401
|
+
```
|
|
402
|
+
<|#|>START<|#|>
|
|
403
|
+
Question<|#|><ordered question 1><|#|>Answer<|#|><ordered answer 1>
|
|
404
|
+
<|#|>NEXT<|#|>
|
|
405
|
+
Question<|#|><ordered question 2><|#|>Answer<|#|><ordered answer 2>
|
|
406
|
+
...
|
|
407
|
+
<|#|>END<|#|>
|
|
408
|
+
```
|
|
409
|
+
|
|
410
|
+
Preserve all content exactly; only reorder.
|
|
411
|
+
"""
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
PROMPTS["deduplication_merge"] = """
|
|
415
|
+
You are a Data Curator ({expert_persona}, {domain}). Create minimal high-quality QA pairs from a cluster.
|
|
416
|
+
|
|
417
|
+
## Task
|
|
418
|
+
- Exact duplicates → single best version
|
|
419
|
+
- Different aspects → merge or keep distinct as appropriate
|
|
420
|
+
- Integrate related sub-questions into comprehensive pairs
|
|
421
|
+
|
|
422
|
+
Input Candidates:
|
|
423
|
+
{candidates_text}
|
|
424
|
+
|
|
425
|
+
## Output Format
|
|
426
|
+
|
|
427
|
+
```
|
|
428
|
+
<|#|>START<|#|>
|
|
429
|
+
Question<|#|><refined question><|#|>Answer<|#|><refined answer>
|
|
430
|
+
<|#|>NEXT<|#|>
|
|
431
|
+
...
|
|
432
|
+
<|#|>END<|#|>
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
[EXAMPLE_PLACEHOLDER: Provide merge examples]
|
|
436
|
+
"""
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
PROMPTS["deduplication_reorganize"] = """
|
|
440
|
+
You are a Data Curator ({expert_persona}, {domain}). Reorganize merged QAs into balanced packs.
|
|
441
|
+
|
|
442
|
+
## Guidelines
|
|
443
|
+
- Each pack: related questions sharing a theme (2-4 questions ideal)
|
|
444
|
+
- Single concept → one pack; multiple sub-topics → split into packs
|
|
445
|
+
- Answer in each pack should address all questions comprehensively
|
|
446
|
+
|
|
447
|
+
Input:
|
|
448
|
+
Merged Questions: <list>
|
|
449
|
+
Merged Answers: <list>
|
|
450
|
+
|
|
451
|
+
## Output Format
|
|
452
|
+
|
|
453
|
+
```
|
|
454
|
+
<|#|>START<|#|>
|
|
455
|
+
Question<|#|><related questions separated by newlines><|#|>Answer<|#|><comprehensive answer>
|
|
456
|
+
<|#|>NEXT<|#|>
|
|
457
|
+
...
|
|
458
|
+
<|#|>END<|#|>
|
|
459
|
+
```
|
|
460
|
+
|
|
461
|
+
[EXAMPLE_PLACEHOLDER: Provide reorganization examples]
|
|
462
|
+
"""
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
# =============================================================================
|
|
466
|
+
# RERANKER PROMPTS
|
|
467
|
+
# =============================================================================
|
|
468
|
+
|
|
469
|
+
PROMPTS["rerank_vlm"] = """
|
|
470
|
+
You are an expert retrieval system. Rank chunks by relevance to the query.
|
|
471
|
+
|
|
472
|
+
Each chunk is delimited by:
|
|
473
|
+
- `<CHUNK_START id=N>` ... `<CHUNK_END id=N>`
|
|
474
|
+
- Images: `<IMAGE_START id=X relates_to_chunk=N>` ... `<IMAGE_END id=X>`
|
|
475
|
+
|
|
476
|
+
## Instructions
|
|
477
|
+
1. Analyze text and image relevance
|
|
478
|
+
2. Rank from most relevant (Rank 1) to least relevant
|
|
479
|
+
|
|
480
|
+
## Output Format (exactly)
|
|
481
|
+
|
|
482
|
+
```
|
|
483
|
+
<Rank 1>Chunk X
|
|
484
|
+
<Rank 2>Chunk Y
|
|
485
|
+
<Rank 3>Chunk Z
|
|
486
|
+
...
|
|
487
|
+
```
|
|
488
|
+
|
|
489
|
+
Include ALL chunks. Only output chunk IDs, no content.
|
|
490
|
+
"""
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
PROMPTS["rerank_image_desc"] = """
|
|
494
|
+
Generate a concise 100-word technical description of this image. Focus on key technical information, data, and visual elements useful for retrieval.
|
|
495
|
+
"""
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
# =============================================================================
|
|
499
|
+
# METRICS EVALUATION PROMPTS
|
|
500
|
+
# =============================================================================
|
|
501
|
+
|
|
502
|
+
PROMPTS_METRICS["multihop_reasoning"] = """
|
|
503
|
+
Evaluate the QA pair's reasoning complexity.
|
|
504
|
+
|
|
505
|
+
Contexts: {contexts}
|
|
506
|
+
Question: {question}
|
|
507
|
+
Answer: {answer}
|
|
508
|
+
|
|
509
|
+
## Analysis
|
|
510
|
+
1. **Hop Count**: Distinct information pieces needed (1 = single fact, 2+ = multi-hop)
|
|
511
|
+
2. **Bridge Entity**: Concept/term connecting information pieces
|
|
512
|
+
3. **Reasoning Score**: 0.0 (trivial) to 1.0 (complex multi-step)
|
|
513
|
+
|
|
514
|
+
{format_instructions}
|
|
515
|
+
|
|
516
|
+
[EXAMPLE_PLACEHOLDER: Provide single-hop and multi-hop examples]
|
|
517
|
+
"""
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
PROMPTS_METRICS["visual_dependency"] = """
|
|
521
|
+
Determine if the question requires visual content to answer.
|
|
522
|
+
|
|
523
|
+
Context (Text Only): {contexts}
|
|
524
|
+
Question: {question}
|
|
525
|
+
|
|
526
|
+
## Instructions
|
|
527
|
+
- Answer using ONLY the text context
|
|
528
|
+
- No outside knowledge or hallucination
|
|
529
|
+
- If visual information is required but missing, output: `MISSING_VISUAL`
|
|
530
|
+
|
|
531
|
+
[EXAMPLE_PLACEHOLDER: Provide examples requiring and not requiring visuals]
|
|
532
|
+
"""
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
PROMPTS_METRICS["multimodal_faithfulness_vlm"] = """
|
|
536
|
+
Evaluate answer faithfulness given multimodal context.
|
|
537
|
+
|
|
538
|
+
Question: {question}
|
|
539
|
+
Answer: {answer}
|
|
540
|
+
|
|
541
|
+
## Analysis
|
|
542
|
+
1. Supported by TEXT content?
|
|
543
|
+
2. Supported by VISUAL content?
|
|
544
|
+
3. Any hallucinated/unsupported claims?
|
|
545
|
+
|
|
546
|
+
## Output Format
|
|
547
|
+
|
|
548
|
+
```
|
|
549
|
+
TEXT_SUPPORTED: YES/NO
|
|
550
|
+
VISUAL_SUPPORTED: YES/NO/NA
|
|
551
|
+
FAITHFULNESS_SCORE: 0.0-1.0
|
|
552
|
+
EXPLANATION: <brief>
|
|
553
|
+
```
|
|
554
|
+
"""
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
PROMPTS_METRICS["multimodal_answer_quality_vlm"] = """
|
|
558
|
+
Evaluate answer quality with multimodal context.
|
|
559
|
+
|
|
560
|
+
Question: {question}
|
|
561
|
+
Answer: {answer}
|
|
562
|
+
|
|
563
|
+
## Criteria
|
|
564
|
+
1. **Completeness**: Fully addresses question using all relevant context
|
|
565
|
+
2. **Accuracy**: Factually correct based on context
|
|
566
|
+
3. **Visual Info Used**: Incorporates visual elements (if relevant)
|
|
567
|
+
|
|
568
|
+
## Output Format
|
|
569
|
+
|
|
570
|
+
```
|
|
571
|
+
COMPLETENESS: 0.0-1.0
|
|
572
|
+
ACCURACY: 0.0-1.0
|
|
573
|
+
VISUAL_INFO_USED: YES/NO/NA
|
|
574
|
+
OVERALL_SCORE: 0.0-1.0
|
|
575
|
+
REASONING: <brief>
|
|
576
|
+
```
|
|
577
|
+
"""
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
PROMPTS_METRICS["context_necessity_without"] = """
|
|
581
|
+
Answer the question using ONLY general knowledge. Do NOT make up specific facts.
|
|
582
|
+
|
|
583
|
+
Question: {question}
|
|
584
|
+
|
|
585
|
+
If you cannot answer confidently without reference material, respond: "CANNOT_ANSWER_WITHOUT_CONTEXT"
|
|
586
|
+
|
|
587
|
+
Answer:
|
|
588
|
+
"""
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
PROMPTS_METRICS["context_necessity_verify"] = """
|
|
592
|
+
Compare two answers for semantic equivalence.
|
|
593
|
+
|
|
594
|
+
Ground Truth: {ground_truth}
|
|
595
|
+
Model Answer: {model_answer}
|
|
596
|
+
|
|
597
|
+
## Output Format
|
|
598
|
+
|
|
599
|
+
```
|
|
600
|
+
MATCH: YES/NO/PARTIAL
|
|
601
|
+
EXPLANATION: <brief>
|
|
602
|
+
```
|
|
603
|
+
"""
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
# =============================================================================
|
|
607
|
+
# OPTIMIZED METRICS PROMPTS (Minimal LLM Calls)
|
|
608
|
+
# =============================================================================
|
|
609
|
+
|
|
610
|
+
PROMPTS_METRICS_OPT["prepare_qa"] = """
|
|
611
|
+
Analyze the QA pair and extract evaluation components.
|
|
612
|
+
|
|
613
|
+
QUESTION: {question}
|
|
614
|
+
ANSWER: {answer}
|
|
615
|
+
REFERENCE: {reference}
|
|
616
|
+
|
|
617
|
+
## Tasks
|
|
618
|
+
1. Extract concept hops (concept1 --> concept2 --> ...)
|
|
619
|
+
2. Extract atomic claims from ANSWER
|
|
620
|
+
3. Extract atomic claims from REFERENCE
|
|
621
|
+
4. Generate {num_reverse_questions} questions the answer could address
|
|
622
|
+
|
|
623
|
+
## Output Format
|
|
624
|
+
|
|
625
|
+
```
|
|
626
|
+
CONCEPT_HOPS_QUESTION:
|
|
627
|
+
concept1 --> concept2 --> ...
|
|
628
|
+
|
|
629
|
+
ANSWER_CLAIMS:
|
|
630
|
+
- [claim 1]
|
|
631
|
+
- [claim 2]
|
|
632
|
+
...
|
|
633
|
+
|
|
634
|
+
REFERENCE_CLAIMS:
|
|
635
|
+
- [claim 1]
|
|
636
|
+
- [claim 2]
|
|
637
|
+
...
|
|
638
|
+
|
|
639
|
+
REVERSE_QUESTIONS:
|
|
640
|
+
- [question 1]
|
|
641
|
+
- [question 2]
|
|
642
|
+
...
|
|
643
|
+
```
|
|
644
|
+
|
|
645
|
+
[EXAMPLE_PLACEHOLDER: Provide QA analysis example]
|
|
646
|
+
"""
|
|
647
|
+
|
|
648
|
+
|
|
649
|
+
PROMPTS_METRICS_OPT["faithfulness"] = """
|
|
650
|
+
Verify if each claim can be inferred from the context.
|
|
651
|
+
|
|
652
|
+
CONTEXT:
|
|
653
|
+
{context}
|
|
654
|
+
|
|
655
|
+
CLAIMS TO VERIFY:
|
|
656
|
+
{claims}
|
|
657
|
+
|
|
658
|
+
## Output Format (one per line)
|
|
659
|
+
|
|
660
|
+
```
|
|
661
|
+
CLAIM_1: SUPPORTED/NOT_SUPPORTED
|
|
662
|
+
CLAIM_2: SUPPORTED/NOT_SUPPORTED
|
|
663
|
+
...
|
|
664
|
+
```
|
|
665
|
+
"""
|
|
666
|
+
|
|
667
|
+
|
|
668
|
+
PROMPTS_METRICS_OPT["context_recall"] = """
|
|
669
|
+
Verify if each reference claim can be attributed to the context.
|
|
670
|
+
|
|
671
|
+
CONTEXT:
|
|
672
|
+
{context}
|
|
673
|
+
|
|
674
|
+
REFERENCE CLAIMS:
|
|
675
|
+
{claims}
|
|
676
|
+
|
|
677
|
+
## Output Format (one per line)
|
|
678
|
+
|
|
679
|
+
```
|
|
680
|
+
CLAIM_1: ATTRIBUTED/NOT_ATTRIBUTED
|
|
681
|
+
CLAIM_2: ATTRIBUTED/NOT_ATTRIBUTED
|
|
682
|
+
...
|
|
683
|
+
```
|
|
684
|
+
"""
|
|
685
|
+
|
|
686
|
+
|
|
687
|
+
PROMPTS_METRICS_OPT["context_precision"] = """
|
|
688
|
+
Evaluate context chunk relevance for answering the question.
|
|
689
|
+
|
|
690
|
+
QUESTION: {question}
|
|
691
|
+
REFERENCE ANSWER: {reference}
|
|
692
|
+
|
|
693
|
+
CONTEXT CHUNKS:
|
|
694
|
+
{contexts}
|
|
695
|
+
|
|
696
|
+
## Output Format (one per line)
|
|
697
|
+
|
|
698
|
+
```
|
|
699
|
+
CHUNK_1: RELEVANT/NOT_RELEVANT
|
|
700
|
+
CHUNK_2: RELEVANT/NOT_RELEVANT
|
|
701
|
+
...
|
|
702
|
+
```
|
|
703
|
+
"""
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
PROMPTS_METRICS_OPT["multimodal_faithfulness"] = """
|
|
707
|
+
Verify claims against multimodal context (text AND images).
|
|
708
|
+
|
|
709
|
+
QUESTION: {question}
|
|
710
|
+
ANSWER: {answer}
|
|
711
|
+
|
|
712
|
+
CLAIMS TO VERIFY:
|
|
713
|
+
{claims}
|
|
714
|
+
|
|
715
|
+
## Output Format
|
|
716
|
+
|
|
717
|
+
```
|
|
718
|
+
CLAIM_1: SUPPORTED/NOT_SUPPORTED | SOURCE: TEXT/IMAGE/BOTH/NONE
|
|
719
|
+
CLAIM_2: SUPPORTED/NOT_SUPPORTED | SOURCE: TEXT/IMAGE/BOTH/NONE
|
|
720
|
+
...
|
|
721
|
+
|
|
722
|
+
SUMMARY:
|
|
723
|
+
TEXT_GROUNDED: YES/NO
|
|
724
|
+
VISUAL_GROUNDED: YES/NO/NA
|
|
725
|
+
SUPPORTED_COUNT: [number]
|
|
726
|
+
TOTAL_CLAIMS: [number]
|
|
727
|
+
```
|
|
728
|
+
"""
|
|
729
|
+
|
|
730
|
+
|
|
731
|
+
PROMPTS_METRICS_OPT["multimodal_relevance"] = """
|
|
732
|
+
Generate questions the answer could address and evaluate context utilization.
|
|
733
|
+
|
|
734
|
+
ANSWER: {answer}
|
|
735
|
+
|
|
736
|
+
## Tasks
|
|
737
|
+
1. Generate {num_questions} diverse questions this answer could address
|
|
738
|
+
2. Indicate if each uses TEXT, IMAGE, or BOTH context
|
|
739
|
+
|
|
740
|
+
## Output Format
|
|
741
|
+
|
|
742
|
+
```
|
|
743
|
+
GENERATED_QUESTIONS:
|
|
744
|
+
Q1: [question] | USES: TEXT/IMAGE/BOTH
|
|
745
|
+
Q2: [question] | USES: TEXT/IMAGE/BOTH
|
|
746
|
+
...
|
|
747
|
+
|
|
748
|
+
CONTEXT_UTILIZATION:
|
|
749
|
+
USES_TEXT: YES/NO
|
|
750
|
+
USES_IMAGES: YES/NO/NA
|
|
751
|
+
RELEVANCE_SCORE: 0.0-1.0
|
|
752
|
+
```
|
|
753
|
+
"""
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
PROMPTS_METRICS_OPT["context_necessity_without"] = """
|
|
757
|
+
Answer using ONLY general knowledge. Do NOT fabricate specific facts.
|
|
758
|
+
|
|
759
|
+
If you cannot answer confidently, respond: CANNOT_ANSWER
|
|
760
|
+
|
|
761
|
+
QUESTION: {question}
|
|
762
|
+
|
|
763
|
+
YOUR ANSWER:
|
|
764
|
+
"""
|
|
765
|
+
|
|
766
|
+
|
|
767
|
+
PROMPTS_METRICS_OPT["context_necessity_verify"] = """
|
|
768
|
+
Compare model answer to ground truth.
|
|
769
|
+
|
|
770
|
+
GROUND TRUTH: {ground_truth}
|
|
771
|
+
MODEL ANSWER: {model_answer}
|
|
772
|
+
|
|
773
|
+
Respond with exactly one of:
|
|
774
|
+
- MATCH: YES (correct and complete)
|
|
775
|
+
- MATCH: PARTIAL (partially correct)
|
|
776
|
+
- MATCH: NO (incorrect or missing key information)
|
|
777
|
+
|
|
778
|
+
YOUR VERDICT:
|
|
779
|
+
"""
|
|
780
|
+
|
|
781
|
+
|
|
782
|
+
PROMPTS_METRICS_OPT["multihop_reasoning"] = """
|
|
783
|
+
Analyze if answering requires multi-hop reasoning.
|
|
784
|
+
|
|
785
|
+
CONTEXTS:
|
|
786
|
+
{contexts}
|
|
787
|
+
|
|
788
|
+
QUESTION: {question}
|
|
789
|
+
ANSWER: {answer}
|
|
790
|
+
|
|
791
|
+
## Output Format
|
|
792
|
+
|
|
793
|
+
```
|
|
794
|
+
HOP_COUNT: [number]
|
|
795
|
+
REASONING_SCORE: 0.0-1.0
|
|
796
|
+
BRIDGE_ENTITY: [entity or None]
|
|
797
|
+
EXPLANATION: <brief>
|
|
798
|
+
```
|
|
799
|
+
|
|
800
|
+
- HOP_COUNT: 1 = single fact, 2+ = multi-hop
|
|
801
|
+
- REASONING_SCORE: 0.0 = trivial, 1.0 = complex
|
|
802
|
+
"""
|
|
803
|
+
|
|
804
|
+
|
|
805
|
+
PROMPTS_METRICS_OPT["visual_dependency"] = """
|
|
806
|
+
Determine if the question can be answered from text alone.
|
|
807
|
+
|
|
808
|
+
TEXT CONTEXT:
|
|
809
|
+
{contexts}
|
|
810
|
+
|
|
811
|
+
QUESTION: {question}
|
|
812
|
+
|
|
813
|
+
If you can answer completely from text, provide your answer.
|
|
814
|
+
If visual information is missing and required, respond: MISSING_VISUAL
|
|
815
|
+
|
|
816
|
+
YOUR RESPONSE:
|
|
817
|
+
"""
|
|
818
|
+
|
|
819
|
+
|
|
820
|
+
# =============================================================================
|
|
821
|
+
# UTILITY FUNCTIONS
|
|
822
|
+
# =============================================================================
|
|
823
|
+
|
|
824
|
+
def get_prompt(category: str, name: str, **kwargs) -> str:
    """
    Retrieve and format a prompt template.

    Args:
        category: Prompt category ('prompts', 'desc', 'chunk', 'metrics', 'metrics_opt')
        name: Prompt name within category
        **kwargs: Template variables to substitute

    Returns:
        Formatted prompt string
    """
    prompt_dicts = {
        'prompts': PROMPTS,
        'desc': PROMPTS_DESC,
        'chunk': PROMPTS_CHUNK,
        'metrics': PROMPTS_METRICS,
        'metrics_opt': PROMPTS_METRICS_OPT
    }

    if category not in prompt_dicts:
        raise ValueError(f"Unknown category: {category}")

    prompt_dict = prompt_dicts[category]

    if name not in prompt_dict:
        raise ValueError(f"Unknown prompt: {name} in category {category}")

    template = prompt_dict[name]

    if kwargs:
        return template.format(**kwargs)

    return template


def list_prompts() -> dict[str, list[str]]:
    """List all available prompts by category."""
    return {
        'prompts': list(PROMPTS.keys()),
        'desc': list(PROMPTS_DESC.keys()),
        'chunk': list(PROMPTS_CHUNK.keys()),
        'metrics': list(PROMPTS_METRICS.keys()),
        'metrics_opt': list(PROMPTS_METRICS_OPT.keys())
    }


# =============================================================================
# MODULE INFO
# =============================================================================

__version__ = "1.0.0"
__all__ = [
    "PROMPTS",
    "PROMPTS_DESC",
    "PROMPTS_CHUNK",
    "PROMPTS_METRICS",
    "PROMPTS_METRICS_OPT",
    "get_prompt",
    "list_prompts"
]
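Usage sketch (illustrative, not part of the published wheel): the module's get_prompt() helper formats a template from one of the five dictionaries, and generation prompts such as PROMPTS["question_answer_generation"] instruct the model to emit QA records wrapped in <|#|>START<|#|> / <|#|>NEXT<|#|> / <|#|>END<|#|> markers with <|#|>-separated fields. A minimal caller and parser for that convention might look as follows; the example values, the parse_qa_response helper, and the call_llm placeholder are assumptions for illustration, not code shipped in mirage-benchmark.

from mirage.core.prompts import PROMPTS, get_prompt

TUPLE_DELIM = PROMPTS["DEFAULT_TUPLE_DELIMITER"]       # "<|#|>"
START = f"{TUPLE_DELIM}START{TUPLE_DELIM}"             # "<|#|>START<|#|>"
NEXT = f"{TUPLE_DELIM}NEXT{TUPLE_DELIM}"               # "<|#|>NEXT<|#|>"
END = PROMPTS["DEFAULT_COMPLETION_DELIMITER"]          # "<|#|>END<|#|>"


def parse_qa_response(raw: str) -> list[dict]:
    """Parse the <|#|>START ... <|#|>END QA block format into dicts.

    Illustrative helper only; it mirrors the output format documented in
    PROMPTS["question_answer_generation"] and is not part of the package.
    """
    start, end = raw.find(START), raw.find(END)
    if start == -1 or end == -1:
        return []
    records = []
    for block in raw[start + len(START):end].split(NEXT):
        # Expected field layout: Question, <q>, Answer, <a>, Relevance, <n>, Difficulty, <n>
        fields = [f.strip() for f in block.split(TUPLE_DELIM) if f.strip()]
        pairs = dict(zip(fields[0::2], fields[1::2]))
        if "Question" in pairs and "Answer" in pairs:
            records.append({
                "question": pairs["Question"],
                "answer": pairs["Answer"],
                "relevance": pairs.get("Relevance"),
                "difficulty": pairs.get("Difficulty"),
            })
    return records


# Format the QA-generation template (placeholder values, not from the package).
prompt = get_prompt(
    "prompts", "question_answer_generation",
    expert_persona="power electronics engineer",
    domain_context="power electronics",
    domain_relevance="power electronics",
    content="<chunk text goes here>",
)
# response = call_llm(prompt)  # hypothetical LLM call
response = (
    "<|#|>START<|#|>\n"
    "Question<|#|>Example question?<|#|>Answer<|#|>Example answer."
    "<|#|>Relevance<|#|>8<|#|>Difficulty<|#|>6\n"
    "<|#|>END<|#|>"
)
print(parse_qa_response(response))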