mirage-benchmark 1.0.4__py3-none-any.whl


mirage/core/prompts.py ADDED
@@ -0,0 +1,884 @@
+ """
+ Multimodal RAG Evaluation Dataset Generation Prompts
+ =====================================================
+
+ A collection of prompts for generating high-quality Question-Answer datasets
+ from multimodal technical documents. Designed for RAG system evaluation with
+ support for text, tables, figures, and images.
+
+ This module provides prompts for:
+ - Document semantic chunking
+ - Chunk completeness verification
+ - QA pair generation and verification
+ - Deduplication and merging
+ - Retrieval metrics evaluation (faithfulness, precision, recall)
+ - Multimodal content handling
+
+ Usage:
+ from mirage.core.prompts import PROMPTS, PROMPTS_DESC, PROMPTS_CHUNK, PROMPTS_METRICS
+
+ Author: [Your Name/Organization]
+ License: Apache 2.0
+ """
+
+ from __future__ import annotations
+ from typing import Any
+
+ # =============================================================================
+ # CONFIGURATION
+ # =============================================================================
+
+ PROMPTS: dict[str, Any] = {}
+ PROMPTS_DESC: dict[str, Any] = {}
+ PROMPTS_CHUNK: dict[str, Any] = {}
+ PROMPTS_METRICS: dict[str, Any] = {}
+ PROMPTS_METRICS_OPT: dict[str, Any] = {}
+
+ # Delimiters for structured output parsing
+ PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|#|>"
+ PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|#|>END<|#|>"
+
+
+ # =============================================================================
+ # IMAGE/TABLE DESCRIPTION PROMPTS
+ # =============================================================================
+
+ PROMPTS_DESC["image"] = """
+ Generate a technical summary of the provided figure in a SINGLE PARAGRAPH (< 250 words).
+
+ Guidelines:
+ - Focus on technical data, relationships, and engineering principles
+ - Only describe visual attributes if they encode technical information
+ - Exclude non-technical content (watermarks, page numbers, decorative elements)
+
+ Structure (as continuous paragraph):
+ 1. Figure type and engineering objective (one sentence)
+ 2. Technical analysis:
+ - Plots/Charts: axes, units, variables, trends, operating regions
+ - Diagrams: components, connections, flow direction, system architecture
+ 3. Key engineering insights and practical implications
+
+ [EXAMPLE_PLACEHOLDER: Provide domain-specific figure and expected response]
+ """
+
+ PROMPTS_DESC["table"] = """
+ Generate a technical summary of the provided table in a SINGLE PARAGRAPH (< 250 words).
+
+ Guidelines:
+ - Focus on data, specifications, and technical limits
+ - Do not reproduce or enumerate the table data
+ - Exclude generic headers/footers and document metadata
+
+ Structure (as continuous paragraph):
+ 1. Table's engineering function (one sentence, like a caption)
+ 2. Column/row organization and data ranges
+ 3. Primary engineering application or conclusion
+
+ [EXAMPLE_PLACEHOLDER: Provide domain-specific table and expected response]
+ """
+
+
+ # =============================================================================
+ # SEMANTIC CHUNKING PROMPTS
+ # =============================================================================
+
+ PROMPTS_CHUNK["semantic_chunking"] = """
+ You are a document parser for semantic chunking. Segment the markdown into coherent chunks.
+
+ ## Processing Rules
+
+ 1. **Exclusions**: Ignore Table of Contents, List of Figures, List of Tables
+ 2. **Artifact Priority**: Identify figures, tables, standalone images first
+ 3. **Semantic Cohesion**: Each chunk = single self-contained topic
+ 4. **Avoid Fragmentation**: Prefer paragraph-level over sentence-level granularity
+ 5. **Content Integrity**: Preserve exact verbatim markdown
+
+ ## Chunk Types
+
+ | Type | Description |
+ |------|-------------|
+ | `text` | Textual sections/subsections with title and content |
+ | `table` | Table with caption, data, and footnotes |
+ | `table with images` | Table containing embedded images |
+ | `figure` | Image with caption and description |
+ | `standalone image` | Image not associated with a figure caption |
+
+ ## Output Format
+
+ ```
+ <chunk_id>[N]<|#|><chunk_type>[type]<|#|><content>[verbatim markdown]<|#|><artifact>[image path(s) or None]<|#|><status>[COMPLETE|INCOMPLETE]<|#|><chunk_end>
+ ```
+
+ ## Field Definitions
+
+ - `chunk_id`: Sequential number (1, 2, 3...)
+ - `chunk_type`: One of: text, table, table with images, figure, standalone image
+ - `content`: Exact unmodified markdown
+ - `artifact`: Image path(s) from `![alt](path)` syntax, or `None`
+ - `status`: COMPLETE if self-contained, INCOMPLETE if cut off
+
+ [EXAMPLE_PLACEHOLDER: Provide sample document and expected chunked output]
+ """
+
+
+ PROMPTS_CHUNK["completion_verification"] = """
+ You are a Chunk Completion Verification Agent. Evaluate if the chunk is semantically COMPLETE or INCOMPLETE.
+
+ Domain: {domain}
+ Expert Role: {expert_persona}
+
+ ## Incompleteness Indicators
+
+ 1. **Missing References**: "Figure X", "Table Y", "see Section Z" without actual content
+ 2. **Undefined Terms**: Acronyms, classifications, or procedures used without definition
+ 3. **Implicit Context**: "as mentioned earlier", "the previous method", "this configuration"
+ 4. **Missing Artifacts**: Text describing a figure/table without the actual visual
+
+ ## Exceptions (Mark COMPLETE)
+
+ - Standalone images with visible content
+ - Normative reference sections
+ - Universal abbreviations (kW, Hz, °C)
+
+ ## Output Format
+
+ ```
+ Status: COMPLETE|INCOMPLETE
+ Query: None|<search_query_1> | <search_query_2> | ...
+ Explanation: <brief explanation>
+ ```
+
+ Search queries must be specific enough to retrieve the missing content.
+
+ [EXAMPLE_PLACEHOLDER: Provide complete and incomplete chunk examples]
+ """
+
+
+ PROMPTS_CHUNK["chunk_addition_verification"] = """
+ You are a Chunk Addition Verification Agent. Classify if a CANDIDATE chunk should be added to build context.
+
+ Context: The ORIGINAL chunk is incomplete. A search found the CANDIDATE chunk.
+
+ Expert Role: {expert_persona}
+ Domain: {domain}
+
+ ## Classification
+
+ **EXPLANATORY**: Directly resolves incompleteness
+ - Provides missing artifact (figure, table, formula)
+ - Defines undefined acronym/term
+ - Supplies referenced prior context
+
+ **RELATED**: Useful but doesn't directly resolve
+ - Same domain/topic, complementary information
+ - General theory or background
+ - Could enhance multi-hop QA generation
+
+ **UNRELATED**: No contribution
+ - Different domain with no connection
+ - No semantic overlap
+
+ ## Output Format
+
+ ```
+ Status: EXPLANATORY|RELATED|UNRELATED
+ Explanation: <brief justification>
+ ```
+
+ [EXAMPLE_PLACEHOLDER: Provide original chunk, search query, and candidate chunk examples]
+ """
+
+
+ PROMPTS_CHUNK["relevance_check"] = """
+ Evaluate if the chunk is relevant to the specified expert role and domain.
+
+ Expert Role: {expert_persona}
+ Domain: {domain}
+
+ Chunk Content:
+ {content}
+
+ ## RELEVANT if:
+ - Contains technical information, procedures, specifications useful for the role
+ - Includes figures, diagrams, charts conveying technical data
+ - Addresses topics within the domain expertise
+
+ ## NOT_RELEVANT if:
+ - Only document metadata (titles, page numbers, copyright)
+ - Purely decorative content (logos, backgrounds)
+ - Completely unrelated to the domain
+
+ Respond with ONLY: "RELEVANT" or "NOT_RELEVANT"
+ """
+
+
+ # =============================================================================
+ # DOMAIN AND EXPERT EXTRACTION
+ # =============================================================================
+
+ PROMPTS["domain_and_expert_from_topics"] = """
+ Analyze the following topics extracted from a technical document collection:
+
+ {topic_list_str}
+
+ Determine:
+ 1. The specific technical/professional domain
+ 2. An appropriate expert role title
+
+ ## Output Format
+
+ ```
+ <|#|>START<|#|>
+ <|#|>Domain: <domain name>
+ <|#|>Expert Role: <expert role title>
+ <|#|>END<|#|>
+ ```
+
+ [EXAMPLE_PLACEHOLDER: Provide sample topics and expected domain/role output]
+ """
239
+
240
+
241
+ # =============================================================================
242
+ # QA GENERATION PROMPTS
243
+ # =============================================================================
244
+
245
+ PROMPTS["question_answer_generation"] = """
246
+ You are a(n) {expert_persona} in {domain_context}. Generate QA pairs for evaluating information retrieval systems.
247
+
248
+ Content:
249
+ {content}
250
+
251
+ ## Critical Requirements
252
+
253
+ 1. **Content-Only**: Use ONLY information present in the content. NO external knowledge.
254
+ 2. **Minimal Coverage**: Generate minimum questions to comprehensively span content without redundancy
255
+ 3. **Role-Appropriate**: Questions suitable for {expert_persona} in {domain_relevance}
256
+ 4. **Self-Contained**: Questions must be standalone without implicit references
257
+ 5. **Non-Trivial**: Require specific content to answer, not general knowledge
258
+
259
+ ## Forbidden Vague References
260
+
261
+ NEVER use:
262
+ - "the provided X", "the described X", "the specified X"
263
+ - "this/that/these/those X" without explicit identification
264
+ - "according to the content/document/text"
265
+
266
+ INSTEAD, explicitly name standards, figures, tables, sections.
267
+
268
+ ## Output Format
269
+
270
+ ```
271
+ <|#|>START<|#|>
272
+ Question<|#|><explicit, self-contained question><|#|>Answer<|#|><brief answer with specific references><|#|>Relevance<|#|><0-10><|#|>Difficulty<|#|><0-10>
273
+ <|#|>NEXT<|#|>
274
+ ...
275
+ <|#|>END<|#|>
276
+ ```
277
+
278
+ ## Rating Scales
279
+
280
+ - **Relevance** (0-10): Importance to domain expert (0=irrelevant, 10=critical)
281
+ - **Difficulty** (0-10): Technical depth required (0=trivial, 10=expert insight)
282
+
283
+ [EXAMPLE_PLACEHOLDER: Provide content samples with correct and incorrect QA generation examples]
284
+ """
285
+
286
+
287
+ PROMPTS["question_answer_selection"] = """
288
+ You are a QA Selection Agent ({expert_persona}, {domain_context}). Evaluate if a QA pair should be SELECTED or REJECTED.
289
+
290
+ Content:
291
+ {content}
292
+
293
+ Question: {question}
294
+ Answer: {answer}
295
+
296
+ ## REJECT if:
297
+ 1. **Improper References**: Vague references without explicit identification
298
+ 2. **Vague Phrases**: "the provided/described/specified X", "this/that X"
299
+ 3. **Trivial**: Answerable with general knowledge alone
300
+ 4. **Non-Technical**: Document metadata, structure, formatting
301
+ 5. **Ambiguous**: Unclear or requires unstated assumptions
302
+ 6. **Out of Scope**: Irrelevant to {domain_relevance}
303
+
304
+ ## SELECT if:
305
+ 1. Self-contained and explicit references
306
+ 2. Requires provided content to answer
307
+ 3. Relevant to domain and appropriate difficulty
308
+ 4. Demonstrates good technical depth
309
+
310
+ ## Output Format
311
+
312
+ ```
313
+ Status<|#|>SELECTED|REJECTED<|#|>Relevance<|#|><0-10><|#|>Difficulty<|#|><0-10><|#|>Reason<|#|><brief explanation>
314
+ ```
315
+
316
+ [EXAMPLE_PLACEHOLDER: Provide selection and rejection examples]
317
+ """
318
+
319
+
320
+ PROMPTS["question_answer_verification"] = """
321
+ You are a QA Verification Agent ({expert_persona}, {domain_context}). Verify the QA pair.
322
+
323
+ Content:
324
+ {content}
325
+
326
+ Question: {question}
327
+ Answer: {answer}
328
+
329
+ ## Evaluation Criteria
330
+
331
+ 1. Does the question involve specific content details?
332
+ 2. Does the answer depend on information only in this content?
333
+ 3. Can someone answer using only general knowledge?
334
+ 4. For tables/images: Is the answer factually supported by the data?
335
+
336
+ ## Vague Reference Check
337
+
338
+ QUESTION_INCORRECT if contains:
339
+ - "the provided/described/specified X"
340
+ - "this/that/these/those X" without identification
341
+ - References assuming reader has access to content
342
+
343
+ ## Output Format
344
+
345
+ ```
346
+ QUESTION_CORRECT|QUESTION_INCORRECT, ANSWER_CORRECT|ANSWER_INCORRECT, REQUIRES_CONTENT|CAN_ANSWER_WITHOUT_CONTENT
347
+ Justification: <brief explanation>
348
+ ```
349
+
350
+ [EXAMPLE_PLACEHOLDER: Provide verification examples]
351
+ """
352
+
353
+
354
+ PROMPTS["question_answer_generation_corrected"] = """
355
+ You are a(n) {expert_persona} in {domain_context}. Correct a failed QA pair.
356
+
357
+ Content:
358
+ {content}
359
+
360
+ ## Failed QA and Feedback:
361
+ {failed_qa_feedback}
362
+
363
+ ## Common Fixes
364
+
365
+ 1. **Vague References**: Replace with explicit identifiers (document, section, figure names)
366
+ 2. **Factual Errors**: Verify against provided content
367
+ 3. **Too General**: Make specific to content information
368
+ 4. **Hallucination**: Only reference what exists in content
369
+
370
+ ## Output Format
371
+
372
+ ```
373
+ <|#|>START<|#|>
374
+ Question<|#|><corrected question><|#|>Answer<|#|><corrected answer><|#|>Relevance<|#|><0-10><|#|>Difficulty<|#|><0-10>
375
+ <|#|>END<|#|>
376
+ ```
377
+
378
+ If the original topic cannot be addressed with available content, return empty:
379
+ ```
380
+ <|#|>START<|#|>
381
+ <|#|>END<|#|>
382
+ ```
383
+ """
+
+
+ # =============================================================================
+ # DEDUPLICATION PROMPTS
+ # =============================================================================
+
+ PROMPTS["deduplication_rank"] = """
+ You are a Data Curator ({expert_persona}, {domain}). Order similar QA pairs from least to most similar.
+
+ ## Task
+ Order from "most distinct/unique" to "most redundant" relative to the cluster's core topic.
+
+ Cluster Candidates:
+ {candidates_text}
+
+ ## Output Format
+
+ ```
+ <|#|>START<|#|>
+ Question<|#|><ordered question 1><|#|>Answer<|#|><ordered answer 1>
+ <|#|>NEXT<|#|>
+ Question<|#|><ordered question 2><|#|>Answer<|#|><ordered answer 2>
+ ...
+ <|#|>END<|#|>
+ ```
+
+ Preserve all content exactly; only reorder.
+ """
+
+
+ PROMPTS["deduplication_merge"] = """
+ You are a Data Curator ({expert_persona}, {domain}). Create minimal high-quality QA pairs from a cluster.
+
+ ## Task
+ - Exact duplicates → single best version
+ - Different aspects → merge or keep distinct as appropriate
+ - Integrate related sub-questions into comprehensive pairs
+
+ Input Candidates:
+ {candidates_text}
+
+ ## Output Format
+
+ ```
+ <|#|>START<|#|>
+ Question<|#|><refined question><|#|>Answer<|#|><refined answer>
+ <|#|>NEXT<|#|>
+ ...
+ <|#|>END<|#|>
+ ```
+
+ [EXAMPLE_PLACEHOLDER: Provide merge examples]
+ """
+
+
+ PROMPTS["deduplication_reorganize"] = """
+ You are a Data Curator ({expert_persona}, {domain}). Reorganize merged QAs into balanced packs.
+
+ ## Guidelines
+ - Each pack: related questions sharing a theme (2-4 questions ideal)
+ - Single concept → one pack; multiple sub-topics → split into packs
+ - Answer in each pack should address all questions comprehensively
+
+ Input:
+ Merged Questions: <list>
+ Merged Answers: <list>
+
+ ## Output Format
+
+ ```
+ <|#|>START<|#|>
+ Question<|#|><related questions separated by newlines><|#|>Answer<|#|><comprehensive answer>
+ <|#|>NEXT<|#|>
+ ...
+ <|#|>END<|#|>
+ ```
+
+ [EXAMPLE_PLACEHOLDER: Provide reorganization examples]
+ """
+
+
+ # =============================================================================
+ # RERANKER PROMPTS
+ # =============================================================================
+
+ PROMPTS["rerank_vlm"] = """
+ You are an expert retrieval system. Rank chunks by relevance to the query.
+
+ Each chunk is delimited by:
+ - `<CHUNK_START id=N>` ... `<CHUNK_END id=N>`
+ - Images: `<IMAGE_START id=X relates_to_chunk=N>` ... `<IMAGE_END id=X>`
+
+ ## Instructions
+ 1. Analyze text and image relevance
+ 2. Rank from most relevant (Rank 1) to least relevant
+
+ ## Output Format (exactly)
+
+ ```
+ <Rank 1>Chunk X
+ <Rank 2>Chunk Y
+ <Rank 3>Chunk Z
+ ...
+ ```
+
+ Include ALL chunks. Only output chunk IDs, no content.
+ """
+
+
+ PROMPTS["rerank_image_desc"] = """
+ Generate a concise 100-word technical description of this image. Focus on key technical information, data, and visual elements useful for retrieval.
+ """
+
+
+ # =============================================================================
+ # METRICS EVALUATION PROMPTS
+ # =============================================================================
+
+ PROMPTS_METRICS["multihop_reasoning"] = """
+ Evaluate the QA pair's reasoning complexity.
+
+ Contexts: {contexts}
+ Question: {question}
+ Answer: {answer}
+
+ ## Analysis
+ 1. **Hop Count**: Distinct information pieces needed (1 = single fact, 2+ = multi-hop)
+ 2. **Bridge Entity**: Concept/term connecting information pieces
+ 3. **Reasoning Score**: 0.0 (trivial) to 1.0 (complex multi-step)
+
+ {format_instructions}
+
+ [EXAMPLE_PLACEHOLDER: Provide single-hop and multi-hop examples]
+ """
+
+
+ PROMPTS_METRICS["visual_dependency"] = """
+ Determine if the question requires visual content to answer.
+
+ Context (Text Only): {contexts}
+ Question: {question}
+
+ ## Instructions
+ - Answer using ONLY the text context
+ - No outside knowledge or hallucination
+ - If visual information is required but missing, output: `MISSING_VISUAL`
+
+ [EXAMPLE_PLACEHOLDER: Provide examples requiring and not requiring visuals]
+ """
+
+
+ PROMPTS_METRICS["multimodal_faithfulness_vlm"] = """
+ Evaluate answer faithfulness given multimodal context.
+
+ Question: {question}
+ Answer: {answer}
+
+ ## Analysis
+ 1. Supported by TEXT content?
+ 2. Supported by VISUAL content?
+ 3. Any hallucinated/unsupported claims?
+
+ ## Output Format
+
+ ```
+ TEXT_SUPPORTED: YES/NO
+ VISUAL_SUPPORTED: YES/NO/NA
+ FAITHFULNESS_SCORE: 0.0-1.0
+ EXPLANATION: <brief>
+ ```
+ """
+
+
+ PROMPTS_METRICS["multimodal_answer_quality_vlm"] = """
+ Evaluate answer quality with multimodal context.
+
+ Question: {question}
+ Answer: {answer}
+
+ ## Criteria
+ 1. **Completeness**: Fully addresses question using all relevant context
+ 2. **Accuracy**: Factually correct based on context
+ 3. **Visual Info Used**: Incorporates visual elements (if relevant)
+
+ ## Output Format
+
+ ```
+ COMPLETENESS: 0.0-1.0
+ ACCURACY: 0.0-1.0
+ VISUAL_INFO_USED: YES/NO/NA
+ OVERALL_SCORE: 0.0-1.0
+ REASONING: <brief>
+ ```
+ """
+
+
+ PROMPTS_METRICS["context_necessity_without"] = """
+ Answer the question using ONLY general knowledge. Do NOT make up specific facts.
+
+ Question: {question}
+
+ If you cannot answer confidently without reference material, respond: "CANNOT_ANSWER_WITHOUT_CONTEXT"
+
+ Answer:
+ """
+
+
+ PROMPTS_METRICS["context_necessity_verify"] = """
+ Compare two answers for semantic equivalence.
+
+ Ground Truth: {ground_truth}
+ Model Answer: {model_answer}
+
+ ## Output Format
+
+ ```
+ MATCH: YES/NO/PARTIAL
+ EXPLANATION: <brief>
+ ```
+ """
+
+
+ # =============================================================================
+ # OPTIMIZED METRICS PROMPTS (Minimal LLM Calls)
+ # =============================================================================
+
+ PROMPTS_METRICS_OPT["prepare_qa"] = """
+ Analyze the QA pair and extract evaluation components.
+
+ QUESTION: {question}
+ ANSWER: {answer}
+ REFERENCE: {reference}
+
+ ## Tasks
+ 1. Extract concept hops (concept1 --> concept2 --> ...)
+ 2. Extract atomic claims from ANSWER
+ 3. Extract atomic claims from REFERENCE
+ 4. Generate {num_reverse_questions} questions the answer could address
+
+ ## Output Format
+
+ ```
+ CONCEPT_HOPS_QUESTION:
+ concept1 --> concept2 --> ...
+
+ ANSWER_CLAIMS:
+ - [claim 1]
+ - [claim 2]
+ ...
+
+ REFERENCE_CLAIMS:
+ - [claim 1]
+ - [claim 2]
+ ...
+
+ REVERSE_QUESTIONS:
+ - [question 1]
+ - [question 2]
+ ...
+ ```
+
+ [EXAMPLE_PLACEHOLDER: Provide QA analysis example]
+ """
+
+
+ PROMPTS_METRICS_OPT["faithfulness"] = """
+ Verify if each claim can be inferred from the context.
+
+ CONTEXT:
+ {context}
+
+ CLAIMS TO VERIFY:
+ {claims}
+
+ ## Output Format (one per line)
+
+ ```
+ CLAIM_1: SUPPORTED/NOT_SUPPORTED
+ CLAIM_2: SUPPORTED/NOT_SUPPORTED
+ ...
+ ```
+ """
+
+
+ PROMPTS_METRICS_OPT["context_recall"] = """
+ Verify if each reference claim can be attributed to the context.
+
+ CONTEXT:
+ {context}
+
+ REFERENCE CLAIMS:
+ {claims}
+
+ ## Output Format (one per line)
+
+ ```
+ CLAIM_1: ATTRIBUTED/NOT_ATTRIBUTED
+ CLAIM_2: ATTRIBUTED/NOT_ATTRIBUTED
+ ...
+ ```
+ """
+
+
+ PROMPTS_METRICS_OPT["context_precision"] = """
+ Evaluate context chunk relevance for answering the question.
+
+ QUESTION: {question}
+ REFERENCE ANSWER: {reference}
+
+ CONTEXT CHUNKS:
+ {contexts}
+
+ ## Output Format (one per line)
+
+ ```
+ CHUNK_1: RELEVANT/NOT_RELEVANT
+ CHUNK_2: RELEVANT/NOT_RELEVANT
+ ...
+ ```
+ """
+
+
+ PROMPTS_METRICS_OPT["multimodal_faithfulness"] = """
+ Verify claims against multimodal context (text AND images).
+
+ QUESTION: {question}
+ ANSWER: {answer}
+
+ CLAIMS TO VERIFY:
+ {claims}
+
+ ## Output Format
+
+ ```
+ CLAIM_1: SUPPORTED/NOT_SUPPORTED | SOURCE: TEXT/IMAGE/BOTH/NONE
+ CLAIM_2: SUPPORTED/NOT_SUPPORTED | SOURCE: TEXT/IMAGE/BOTH/NONE
+ ...
+
+ SUMMARY:
+ TEXT_GROUNDED: YES/NO
+ VISUAL_GROUNDED: YES/NO/NA
+ SUPPORTED_COUNT: [number]
+ TOTAL_CLAIMS: [number]
+ ```
+ """
+
+
+ PROMPTS_METRICS_OPT["multimodal_relevance"] = """
+ Generate questions the answer could address and evaluate context utilization.
+
+ ANSWER: {answer}
+
+ ## Tasks
+ 1. Generate {num_questions} diverse questions this answer could address
+ 2. Indicate if each uses TEXT, IMAGE, or BOTH context
+
+ ## Output Format
+
+ ```
+ GENERATED_QUESTIONS:
+ Q1: [question] | USES: TEXT/IMAGE/BOTH
+ Q2: [question] | USES: TEXT/IMAGE/BOTH
+ ...
+
+ CONTEXT_UTILIZATION:
+ USES_TEXT: YES/NO
+ USES_IMAGES: YES/NO/NA
+ RELEVANCE_SCORE: 0.0-1.0
+ ```
+ """
+
+
+ PROMPTS_METRICS_OPT["context_necessity_without"] = """
+ Answer using ONLY general knowledge. Do NOT fabricate specific facts.
+
+ If you cannot answer confidently, respond: CANNOT_ANSWER
+
+ QUESTION: {question}
+
+ YOUR ANSWER:
+ """
+
+
+ PROMPTS_METRICS_OPT["context_necessity_verify"] = """
+ Compare model answer to ground truth.
+
+ GROUND TRUTH: {ground_truth}
+ MODEL ANSWER: {model_answer}
+
+ Respond with exactly one of:
+ - MATCH: YES (correct and complete)
+ - MATCH: PARTIAL (partially correct)
+ - MATCH: NO (incorrect or missing key information)
+
+ YOUR VERDICT:
+ """
+
+
+ PROMPTS_METRICS_OPT["multihop_reasoning"] = """
+ Analyze if answering requires multi-hop reasoning.
+
+ CONTEXTS:
+ {contexts}
+
+ QUESTION: {question}
+ ANSWER: {answer}
+
+ ## Output Format
+
+ ```
+ HOP_COUNT: [number]
+ REASONING_SCORE: 0.0-1.0
+ BRIDGE_ENTITY: [entity or None]
+ EXPLANATION: <brief>
+ ```
+
+ - HOP_COUNT: 1 = single fact, 2+ = multi-hop
+ - REASONING_SCORE: 0.0 = trivial, 1.0 = complex
+ """
+
+
+ PROMPTS_METRICS_OPT["visual_dependency"] = """
+ Determine if the question can be answered from text alone.
+
+ TEXT CONTEXT:
+ {contexts}
+
+ QUESTION: {question}
+
+ If you can answer completely from text, provide your answer.
+ If visual information is missing and required, respond: MISSING_VISUAL
+
+ YOUR RESPONSE:
+ """
+
+
+ # =============================================================================
+ # UTILITY FUNCTIONS
+ # =============================================================================
+
+ def get_prompt(category: str, name: str, **kwargs) -> str:
+     """
+     Retrieve and format a prompt template.
+
+     Args:
+         category: Prompt category ('prompts', 'desc', 'chunk', 'metrics', 'metrics_opt')
+         name: Prompt name within category
+         **kwargs: Template variables to substitute
+
+     Returns:
+         Formatted prompt string
+     """
+     prompt_dicts = {
+         'prompts': PROMPTS,
+         'desc': PROMPTS_DESC,
+         'chunk': PROMPTS_CHUNK,
+         'metrics': PROMPTS_METRICS,
+         'metrics_opt': PROMPTS_METRICS_OPT
+     }
+
+     if category not in prompt_dicts:
+         raise ValueError(f"Unknown category: {category}")
+
+     prompt_dict = prompt_dicts[category]
+
+     if name not in prompt_dict:
+         raise ValueError(f"Unknown prompt: {name} in category {category}")
+
+     template = prompt_dict[name]
+
+     if kwargs:
+         return template.format(**kwargs)
+
+     return template
+
+
+ def list_prompts() -> dict[str, list[str]]:
+     """List all available prompts by category."""
+     return {
+         'prompts': list(PROMPTS.keys()),
+         'desc': list(PROMPTS_DESC.keys()),
+         'chunk': list(PROMPTS_CHUNK.keys()),
+         'metrics': list(PROMPTS_METRICS.keys()),
+         'metrics_opt': list(PROMPTS_METRICS_OPT.keys())
+     }
+
+
+ # =============================================================================
+ # MODULE INFO
+ # =============================================================================
+
+ __version__ = "1.0.0"
+ __all__ = [
+     "PROMPTS",
+     "PROMPTS_DESC",
+     "PROMPTS_CHUNK",
+     "PROMPTS_METRICS",
+     "PROMPTS_METRICS_OPT",
+     "get_prompt",
+     "list_prompts"
+ ]
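
For reference, a minimal usage sketch of the helpers defined in the added file. It assumes the wheel makes the module importable as `mirage.core.prompts` (matching its path), and the persona, domain, and content strings are placeholder values, not anything shipped with the package.

```python
# Illustrative only -- not part of the released module.
# Assumes the file above is importable as mirage.core.prompts.
from mirage.core.prompts import get_prompt, list_prompts

# Enumerate the prompt names registered in each category.
print(list_prompts()["chunk"])
# ['semantic_chunking', 'completion_verification', 'chunk_addition_verification', 'relevance_check']

# Fill a template's {placeholders} via get_prompt(category, name, **kwargs).
prompt = get_prompt(
    "chunk",
    "relevance_check",
    expert_persona="process engineer",       # placeholder value
    domain="industrial automation",          # placeholder value
    content="Chunk text to be screened ...", # placeholder value
)
print(prompt)
```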
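The generation and deduplication prompts all frame their records with `<|#|>START<|#|>` / `<|#|>END<|#|>` and separate them with `<|#|>NEXT<|#|>`. The sketch below shows one way such output could be parsed back into QA dicts; `parse_qa_records` is written against the Output Format block of `question_answer_generation` as an illustration and is not a function shipped in the package.

```python
# Illustrative parser for the <|#|>-delimited QA records described in
# PROMPTS["question_answer_generation"]; not part of the released module.
DELIM = "<|#|>"  # value of PROMPTS["DEFAULT_TUPLE_DELIMITER"]


def parse_qa_records(raw: str) -> list[dict[str, str]]:
    # Keep only the text between the START marker and the completion delimiter.
    body = raw.split(f"{DELIM}START{DELIM}", 1)[-1]
    body = body.split(f"{DELIM}END{DELIM}", 1)[0]

    records: list[dict[str, str]] = []
    for block in body.split(f"{DELIM}NEXT{DELIM}"):
        fields = [f.strip() for f in block.split(DELIM) if f.strip()]
        # Expected layout: Question, <q>, Answer, <a>, Relevance, <r>, Difficulty, <d>
        if len(fields) < 8 or fields[0] != "Question":
            continue
        records.append({
            "question": fields[1],
            "answer": fields[3],
            "relevance": fields[5],
            "difficulty": fields[7],
        })
    return records
```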
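Similarly, the per-claim verdict lines requested by `PROMPTS_METRICS_OPT["faithfulness"]` and `"context_recall"` reduce naturally to a ratio. The `faithfulness_score` helper below is a hypothetical post-processing sketch, not package code.

```python
# Hypothetical post-processing sketch; not part of the released module.
def faithfulness_score(verdicts: str) -> float:
    """Fraction of CLAIM_N lines that carry a positive verdict."""
    lines = [ln.strip() for ln in verdicts.splitlines()
             if ln.strip().startswith("CLAIM_")]
    if not lines:
        return 0.0
    # CLAIM lines without a NOT_ verdict count as SUPPORTED/ATTRIBUTED.
    positive = sum(1 for ln in lines if "NOT_" not in ln)
    return positive / len(lines)


# Example: two of three claims supported -> 0.666...
print(faithfulness_score("CLAIM_1: SUPPORTED\nCLAIM_2: NOT_SUPPORTED\nCLAIM_3: SUPPORTED"))
```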