sdg-hub 0.1.0a3__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. sdg_hub/_version.py +2 -2
  2. sdg_hub/blocks/__init__.py +35 -5
  3. sdg_hub/blocks/block.py +58 -16
  4. sdg_hub/blocks/llmblock.py +149 -204
  5. sdg_hub/blocks/utilblocks.py +500 -43
  6. sdg_hub/checkpointer.py +139 -0
  7. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  8. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  9. sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
  10. sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
  11. sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
  12. sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
  13. sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
  14. sdg_hub/configs/skills/contexts.yaml +18 -11
  15. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
  16. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
  17. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
  18. sdg_hub/configs/skills/freeform_questions.yaml +21 -16
  19. sdg_hub/configs/skills/freeform_responses.yaml +19 -25
  20. sdg_hub/configs/skills/router.yaml +53 -6
  21. sdg_hub/flow.py +351 -21
  22. sdg_hub/flow_runner.py +216 -0
  23. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +26 -9
  24. sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
  25. sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
  26. sdg_hub/pipeline.py +67 -12
  27. sdg_hub/prompts.py +26 -0
  28. sdg_hub/sdg.py +128 -86
  29. sdg_hub/utils/config_validation.py +91 -0
  30. sdg_hub/utils/validation_result.py +10 -0
  31. sdg_hub-0.1.1.dist-info/METADATA +190 -0
  32. sdg_hub-0.1.1.dist-info/RECORD +86 -0
  33. {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/WHEEL +1 -1
  34. sdg_hub/blocks/filterblock.py +0 -76
  35. sdg_hub/blocks/iterblock.py +0 -31
  36. sdg_hub/blocks/rmblocks.py +0 -194
  37. sdg_hub/configs/annotations/simple.yaml +0 -10
  38. sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
  39. sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
  40. sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
  41. sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
  42. sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
  43. sdg_hub/utils/chunking.py +0 -73
  44. sdg_hub/utils/docprocessor.py +0 -357
  45. sdg_hub/utils/parse_and_convert.py +0 -392
  46. sdg_hub-0.1.0a3.dist-info/METADATA +0 -154
  47. sdg_hub-0.1.0a3.dist-info/RECORD +0 -90
  48. /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
  49. /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
  50. /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
  51. /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
  52. /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
  53. /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
  54. /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
  55. /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
  56. /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
  57. /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
  58. {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/licenses/LICENSE +0 -0
  59. {sdg_hub-0.1.0a3.dist-info → sdg_hub-0.1.1.dist-info}/top_level.txt +0 -0
sdg_hub/checkpointer.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# Standard
+from typing import Optional, List
+import uuid
+
+# Third Party
+from datasets import Dataset, load_dataset
+from datasets.data_files import EmptyDatasetError
+
+# Local
+from .logger_config import setup_logger
+from .utils.datautils import safe_concatenate_datasets
+
+logger = setup_logger(__name__)
+
+
+class Checkpointer:
+    """
+    Handles checkpointing functionality for SDG data generation.
+    Manages saving intermediate results and loading existing checkpoints.
+    """
+
+    def __init__(self, checkpoint_dir: Optional[str] = None, save_freq: Optional[int] = None):
+        """
+        Initialize the Checkpointer.
+
+        Args:
+            checkpoint_dir: Directory to save/load checkpoints. If None, checkpointing is disabled.
+            save_freq: Frequency for saving intermediate checkpoints during batch processing.
+        """
+        self.checkpoint_dir = checkpoint_dir
+        self.save_freq = save_freq
+
+    def load_existing_data(self, seed_dataset: Dataset) -> tuple[Dataset, Optional[Dataset]]:
+        """
+        Load existing checkpoint data and determine what still needs to be generated.
+
+        Args:
+            seed_dataset: Original input dataset
+
+        Returns:
+            Tuple of (remaining_data_to_generate, pre_generated_data)
+            If no checkpoints exist, returns (seed_dataset, None)
+        """
+        if self.checkpoint_dir is None:
+            return seed_dataset, None
+
+        try:
+            # Load existing checkpoints
+            pre_generated_data = load_dataset(
+                "json", data_dir=self.checkpoint_dir, split="train"
+            )
+            logger.info(
+                f"Loading existing checkpoints from {self.checkpoint_dir}, "
+                f"with {pre_generated_data.num_rows} rows"
+            )
+
+            # Find missing data that still needs to be generated
+            missing_data = self._get_missing_data(seed_dataset, pre_generated_data)
+
+            if missing_data.num_rows == 0:
+                logger.info(
+                    f"All seed data has been generated, no missing rows found, "
+                    f"returning data from {self.checkpoint_dir}"
+                )
+                return missing_data, pre_generated_data
+
+            logger.info(f"Found {missing_data.num_rows} missing rows in the dataset")
+            return missing_data, pre_generated_data
+
+        except EmptyDatasetError:
+            logger.info(
+                f"No existing checkpoints found in {self.checkpoint_dir}, "
+                f"generating from scratch"
+            )
+            return seed_dataset, None
+
+    def _get_missing_data(self, seed_data: Dataset, generated_data: Dataset) -> Dataset:
+        """
+        Identify rows in seed_data that are not present in generated_data.
+
+        Args:
+            seed_data: Original seed dataset
+            generated_data: Previously generated dataset
+
+        Returns:
+            Dataset containing only the missing rows from seed_data
+        """
+        # Get the common columns between the two datasets
+        common_columns = list(
+            set(seed_data.column_names) & set(generated_data.column_names)
+        )
+
+        # Extract the relevant data based on common columns
+        seed_data_common = seed_data.select_columns(common_columns)
+        generated_data_common = generated_data.select_columns(common_columns)
+
+        # Convert to Pandas DataFrames for easier comparison
+        seed_df = seed_data_common.to_pandas()
+        generated_df = generated_data_common.to_pandas()
+
+        # Identify missing rows
+        missing_df = seed_df[
+            ~seed_df.apply(tuple, 1).isin(generated_df.apply(tuple, 1))
+        ]
+
+        # Convert back to Dataset
+        missing_data = Dataset.from_pandas(missing_df, preserve_index=False)
+
+        return missing_data
+
+    def save_intermediate_checkpoint(self, dataset: Dataset) -> None:
+        """
+        Save intermediate checkpoint data to disk.
+
+        Args:
+            dataset: Dataset to save as checkpoint
+        """
+        if self.checkpoint_dir is None:
+            return
+
+        checkpoint_id = uuid.uuid4().hex
+        checkpoint_file = f"{self.checkpoint_dir}/data_checkpoint_{checkpoint_id}.jsonl"
+        logger.info(f"Saving checkpoint to {checkpoint_file}")
+        dataset.to_json(checkpoint_file, orient="records", lines=True)
+
+    def should_save_checkpoint(self, current_split_index: int) -> bool:
+        """
+        Determine if a checkpoint should be saved based on save frequency.
+
+        Args:
+            current_split_index: Current split index (0-based)
+
+        Returns:
+            True if checkpoint should be saved, False otherwise
+        """
+        if self.save_freq is None or self.checkpoint_dir is None:
+            return False
+        return (current_split_index + 1) % self.save_freq == 0
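A minimal usage sketch of the Checkpointer API added above. Only the constructor and the three public methods are taken from the diff; the import path follows the file location, while the dataset contents and the per-row generation step are placeholders for illustration.

```python
# Illustrative sketch only; generate_for_split-style logic is represented by a placeholder.
import os

from datasets import Dataset
from sdg_hub.checkpointer import Checkpointer  # import path inferred from sdg_hub/checkpointer.py

seed_dataset = Dataset.from_dict({"document": ["doc a", "doc b", "doc c"]})

checkpoint_dir = "checkpoints/run1"
os.makedirs(checkpoint_dir, exist_ok=True)  # to_json does not create directories

checkpointer = Checkpointer(checkpoint_dir=checkpoint_dir, save_freq=1)

# Resume: rows already present in the checkpoint directory are skipped.
remaining, pre_generated = checkpointer.load_existing_data(seed_dataset)

for i in range(remaining.num_rows):
    split = remaining.select([i])
    generated = split  # stand-in for the real generation step (e.g. running a flow)
    if checkpointer.should_save_checkpoint(i):
        checkpointer.save_intermediate_checkpoint(generated)
```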
sdg_hub/configs/annotations/detailed_annotations.yaml
@@ -0,0 +1,28 @@
+system: You are an expert text classifier trained to label questions from online forums.
+introduction: "Task Description: You will be given a text and you need to annotate it with one of the following categories: World, Sports, Business, Sci/Tech, Economy"
+principles: |
+  Please follow these rules when performing the classification:
+  - Focus on the main topic, not peripheral mentions
+  - Choose the most specific applicable category
+  - Only choose category label per question
+examples: |
+  Text: Bangladesh paralysed by strikes Opposition activists have brought many towns and cities in Bangladesh to a halt, the day after 18 people died in explosions at a political rally.
+  Category: World
+
+  Text: Desiring Stability Redskins coach Joe Gibbs expects few major personnel changes in the offseason and wants to instill a culture of stability in Washington.
+  Category: Sports
+
+  Text: A Cosmic Storm: When Galaxy Clusters Collide Astronomers have found what they are calling the perfect cosmic storm, a galaxy cluster pile-up so powerful its energy output is second only to the Big Bang.
+  Category: Sci/Tech
+
+  Text: Economy builds steam in KC Fed district The economy continued to strengthen in September and early October in the Great Plains and Rocky Mountain regions covered by the Tenth Federal Reserve District, the Federal Reserve Bank of Kansas City said Wednesday.
+  Category: Economy
+
+generation: |
+  Here is the query for annotation:
+
+  Text: {{text}}
+  Category:
+
+start_tags: [""]
+end_tags: [""]
sdg_hub/configs/annotations/simple_annotations.yaml
@@ -0,0 +1,9 @@
+system: null
+introduction: "Task Description: Data Annotation"
+principles: null
+examples: null
+generation: |
+  Here is the query for annotation:
+  {{text}}
+start_tags: [""]
+end_tags: [""]
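The {{text}} placeholders in these annotation configs are Jinja-style template variables. As a rough illustration only (how sdg_hub itself assembles the final prompt is not shown in this diff), the generation block of simple_annotations.yaml could be rendered like this:

```python
# Illustration: render the `generation` template from simple_annotations.yaml with jinja2.
from jinja2 import Template

generation = "Here is the query for annotation:\n{{text}}"
prompt = Template(generation).render(text="Stocks rallied after the central bank's announcement.")
print(prompt)
```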
sdg_hub/configs/knowledge/atomic_facts.yaml
@@ -37,6 +37,7 @@ examples: |
 generation: |
   Now it's your turn breakdown following snippet from article about {{domain}} into atomic facts following similar style as above examples
   [Passage]
+  {{document_outline}}
   {{document}}
   [Facts]
 
sdg_hub/configs/knowledge/detailed_summary.yaml
@@ -11,6 +11,7 @@ examples: ""
 
 generation: |
   Document:
+  {{document_outline}}
   {{document}}
 
 start_tags: [""]
sdg_hub/configs/knowledge/extractive_summary.yaml
@@ -11,6 +11,7 @@ examples: ""
 
 generation: |
   Document:
+  {{document_outline}}
   {{document}}
 
 start_tags: [""]
sdg_hub/configs/knowledge/generate_questions.yaml
@@ -0,0 +1,82 @@
+# This YAML file defines a prompt template for generating educational Q&A pairs from textbook content.
+# The prompt is designed to create comprehensive, domain-specific questions and answers that
+# effectively teach and reinforce key concepts from educational materials.
+#
+# Structure:
+# - system: Sets the AI's role as a knowledgeable assistant
+# - introduction: Main instruction for creating Q&A pairs from textbook chapters
+# - principles: Detailed guidelines for question formulation and educational value
+# - examples: Example Q&A pairs showing expected format and style
+# - generation: Template for the document to be used for Q&A generation
+#
+# Key Features:
+# - Domain-specific question generation (science, legal, etc.)
+# - Multiple difficulty levels and question types
+# - Self-contained questions without external references
+# - Focus on key concepts and learning objectives
+# - Educational value and teaching effectiveness
+#
+# Question Guidelines:
+# - Must be self-contained and independently answerable
+# - Should cover basic recall to advanced comprehension
+# - Include multiple-choice, short answer, and essay types
+# - Align with chapter learning objectives
+# - Avoid references to specific sections or figures
+#
+# Response Format:
+# - Questions and answers are clearly separated
+# - Each response ends with [End] tag
+# - [UNANSWERABLE] for unsuitable content
+#
+# Usage:
+# This prompt is used to generate educational Q&A pairs that effectively teach
+# and reinforce concepts from textbook chapters while maintaining educational
+# value and accessibility.
+
+system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
+
+introduction: Develop a series of educational questions from a chapter in a {{domain}} textbook.
+
+principles: |
+  The questions should:
+  * Self-contained – understandable without needing to reference tables, figures, or specific text sections.
+  * Focus on the provided example and follow the format and style of the provided examples.
+  * Relevant to the subject – based on the textbook’s domain (e.g., legal, scientific, etc.).
+  * Independently answerable – avoid direct references to theorems, figures, or text numbers.
+  * Varied in difficulty - Make difficult same as the provided examples.
+  * Use same format as the provided examples.
+
+  Strictly follow this format for each question your generate while responding
+
+  [QUESTION]
+  <Insert question here>
+  [END]
+
+
+examples: |
+  Here are some examples of questions:
+
+  [Document]
+  {{icl_document}}
+
+  [QUESTION]
+  {{icl_query_1}}
+  [END]
+
+  [QUESTION]
+  {{icl_query_2}}
+  [END]
+
+  [QUESTION]
+  {{icl_query_3}}
+  [END]
+
+generation: |
+  Here is the document:
+
+  [DOCUMENT]
+  {{document_outline}}
+  {{document}}
+
+start_tags: [""]
+end_tags: [""]
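This prompt asks the model to wrap each generated question in [QUESTION] ... [END] blocks. A plausible post-processing sketch (the package's own parsing logic is not part of this diff, and the sample output below is invented) would pull the questions out with a regular expression:

```python
# Hypothetical parsing of a model response shaped by generate_questions.yaml.
import re

raw_output = """[QUESTION]
What is the main function of the mitochondria in a cell?
[END]

[QUESTION]
How does cellular respiration differ from fermentation?
[END]"""

questions = re.findall(r"\[QUESTION\]\s*(.*?)\s*\[END\]", raw_output, flags=re.DOTALL)
for q in questions:
    print(q)
```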
sdg_hub/configs/knowledge/generate_responses.yaml
@@ -0,0 +1,86 @@
+# This YAML file defines a prompt template for generating educational Q&A pairs from textbook content.
+# The prompt is designed to create comprehensive, domain-specific questions and answers that
+# effectively teach and reinforce key concepts from educational materials.
+#
+# Structure:
+# - system: Sets the AI's role as a knowledgeable assistant
+# - introduction: Main instruction for creating Q&A pairs from textbook chapters
+# - principles: Detailed guidelines for question formulation and educational value
+# - examples: Example Q&A pairs showing expected format and style
+# - generation: Template for the document to be used for Q&A generation
+#
+# Key Features:
+# - Domain-specific question generation (science, legal, etc.)
+# - Multiple difficulty levels and question types
+# - Self-contained questions without external references
+# - Focus on key concepts and learning objectives
+# - Educational value and teaching effectiveness
+#
+# Question Guidelines:
+# - Must be self-contained and independently answerable
+# - Should cover basic recall to advanced comprehension
+# - Include multiple-choice, short answer, and essay types
+# - Align with chapter learning objectives
+# - Avoid references to specific sections or figures
+#
+# Response Format:
+# - Questions and answers are clearly separated
+# - Each response ends with [End] tag
+# - [UNANSWERABLE] for unsuitable content
+#
+# Usage:
+# This prompt is used to generate educational Q&A pairs that effectively teach
+# and reinforce concepts from textbook chapters while maintaining educational
+# value and accessibility.
+
+system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
+
+introduction: Answer the question based on the provided document.
+
+principles: |
+  The answers should:
+  * The answer is grounded in the provided document.
+  * Follows the format and style of the provided examples.
+  * Directly answers the question.
+  Strictly follow this format for each question your generate while responding
+
+  [ANSWER]
+  <Insert answer here>
+  [END]
+
+
+examples: |
+  Here are some examples of answers for given questions for a document:
+
+  [Document]
+  {{icl_document}}
+
+  [QUESTION]
+  {{icl_query_1}}
+
+  [ANSWER]
+  {{icl_response_1}}
+  [END]
+
+  [QUESTION]
+  {{icl_query_2}}
+
+  [ANSWER]
+  {{icl_response_2}}
+  [END]
+
+
+generation: |
+  Here is the document:
+
+  [DOCUMENT]
+  {{document_outline}}
+  {{document}}
+
+  [QUESTION]
+  {{question}}
+
+  [ANSWER]
+
+start_tags: [""]
+end_tags: [""]
sdg_hub/configs/skills/contexts.yaml
@@ -1,21 +1,28 @@
-system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
+system: You are a highly capable AI Assistant that specializes in generating high-quality content tailored to specific tasks.
+
+introduction: |
+  Your task is to write a rich, relevant, and well-structured **context** for the following task:
+  Task Description: {{task_description}}
 
-introduction: You are asked to come up with a diverse context for - {{ task_description }}.
 principles: |
-  Please follow these guiding principles when generating responses:
-  * Use proper grammar and punctuation.
-  * Always generate safe and respectful content. Do not generate content that is harmful, abusive, or offensive.
-  * Always generate content that is factually accurate and relevant to the prompt.
-  * Strictly adhere to the prompt and generate responses in the same style and format as the example.
-  * Return the context between [Start of Context] and [End of Context] tags.
+  Please follow these guiding principles when generating the context:
+  * The context should be coherent, informative, and closely aligned with the task description.
+  * Do not include any greetings, explanations, or meta commentary.
+  * Maintain a natural, human-like tone suitable for the domain.
+  * Follow the formatting shown in the example exactly.
+  * Wrap the output between the tags: [Start of Context] and [End of Context].
+
 examples: |
-  To better assist you with this task, here is an example of a context:
+  To guide you, here is an example of a well-structured context:
+
   [Start of Context]
-  {{ seed_context }}
+  {{seed_context}}
   [End of Context]
 
 generation: |
-  Now generate a context paragraph, remember to follow the principles mentioned above and use the same format as the examples. Remember to use the same style and format as the example above. Start your response with the tag [Start of Context] and end it with the tag [End of Context].
+  Now generate a new context following the same structure and principles.
+  Begin your output with [Start of Context] and end with [End of Context].
+  Do not include any additional text outside these tags.
 
 start_tags: ["[Start of Context]"]
 end_tags: ["[End of Context]"]
sdg_hub/configs/skills/evaluate_freeform_pair.yaml
@@ -1,25 +1,89 @@
-system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
+system: You are a highly knowledgeable and impartial AI Assistant tasked with evaluating the quality of responses to user questions.
 
 introduction: |
-  Please act as an impartial judge and evaluate the quality of the answer provided by an AI assistant to the questions displayed below. Evaluate whether or not the answer is a good example of how AI Assistant should respond to the user's instruction. Please assign a score using the following 3-point scale.
+  Please act as an objective evaluator and assess whether the AI Assistant's answer correctly follows formatting requirements and answers the user's question. Use the scoring rubric below and assign a score from 1 to 3.
+
 principles: |
-  1: It means the answer is incorrect, irrelevant, unsafe or provides incomplete and garbage information. For instance, the answer may be factually wrong, off-topic, or filled with irrelevant content that doesn't address the user's question or it could be incomplete and hanging. It may also include any harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content.
+  Use the following 3-point scale to score the answer:
+
+  **1 — Poor Quality**
+  - The output is incorrectly formatted, contains hallucinations, or ignores required tags/structure.
+  - The answer may be off-topic, incomplete, or inconsistent with the task.
+  - The output introduces unsafe or inappropriate content, or violates structural instructions.
 
-  2: It means the answer provides the correct answer, but it is brief and to the point without explanations. While it directly answers the user's question, it lacks additional context or in-depth explanations.
+  **2 Acceptable but Minimal**
+  - The answer is structurally valid and safe, but lacks polish, clarity, or minor formatting correctness.
+  - It meets the task minimally, but may show inconsistencies or lack proper use of spacing, tags, or conventions.
+  - It's not harmful, but also not ready for use without revision.
 
-  3: It means the answer is a perfect answer from an AI Assistant. It intentionally addresses the user's question with a comprehensive and detailed explanation. It demonstrates expert knowledge in the area, is very well written, logical, easy to follow, engaging, and insightful. And the answer is safe and does not include any harmful content.
+  **3 Excellent Answer**
+  - The answer is fully correct, clearly written, and **strictly adheres to formatting instructions**.
+  - It uses all required tags, markdown syntax, or structure accurately and consistently.
+  - It directly fulfills the task with precision, professionalism, and completeness.
 
 examples: |
+  Example 1 — Score: 1 (Poor Formatting, Unsafe)
+
+  Task Description: Generate a markdown table of 3 planets with columns: Name, Gravity, and Moons.
+
+  [Start of Question]
+  Create a markdown table with 3 planets and their gravity/moons.
+  [End of Question]
+
+  [Start of Answer]
+  Mars - 3.7 - 2; Earth - 9.8 - 1; Jupiter - 24.8 - 79
+  [End of Answer]
+
+  [Start of Evaluation]
+  The answer is improperly formatted (not a markdown table) and unstructured. It violates task instructions both structurally and in tone.
+  [End of Evaluation]
+
+  [Start of Score]
+  1
+  [End of Score]
+
+  Example 2 — Score: 2 (Correct but Sloppy Formatting)
+
+  Task Description: Generate a markdown table of 3 planets with columns: Name, Gravity, and Moons.
+
   [Start of Question]
-  Take the role of the joker. Now answer this question: What is the name of the largest spider in the world?
+  Create a markdown table with 3 planets and their gravity/moons.
   [End of Question]
 
+  [Start of Answer]
+  | Name | Gravity | Moons |
+  |-------|---------|-------|
+  | Mars | 3.7 | 2 |
+  | Earth| 9.8| 1 |
+  | Jupiter |24.8 |79|
+  [End of Answer]
+
   [Start of Evaluation]
-  This question is properly formatted, respectful, and relevant to the task of understanding the benefits of renewable energy. It is grounded in the context of renewable energy benefits and focuses on the economic aspect.
+  The table has correct content but inconsistent spacing and pipe alignment. It fulfills the task but lacks polish and readability. It's usable, but not clean or well-structured.
   [End of Evaluation]
 
+  [Start of Score]
+  2
+  [End of Score]
+
+  Example 3 — Score: 3 (Flawless Markdown Table)
+
+  Task Description: Generate a markdown table of 3 planets with columns: Name, Gravity, and Moons.
+
+  [Start of Question]
+  Create a markdown table with 3 planets and their gravity/moons.
+  [End of Question]
+
+  [Start of Answer]
+  | Name | Gravity (m/s²) | Moons |
+  |----------|----------------|-------|
+  | Mars | 3.7 | 2 |
+  | Earth | 9.8 | 1 |
+  | Jupiter | 24.8 | 79 |
+  [End of Answer]
+
   [Start of Evaluation]
-  The answer provided is correct and relevant. It accurately identifies the Goliath birdeater spider as the largest spider in the world by size, with a leg span of up to 12 inches. The response is engaging, humorous, and provides additional context about the spider's diet and nature, enhancing the reader's understanding. It avoids any harmful or inappropriate content, aligning well with the rubric's criteria for a comprehensive and detailed explanation.
+  The answer uses proper markdown syntax, alignment, and column headers. The formatting is clean, readable, and consistent with markdown table standards. It meets the task precisely.
   [End of Evaluation]
 
   [Start of Score]
@@ -27,7 +91,10 @@ examples: |
   [End of Score]
 
 generation: |
-  Here's the question and the answer you need to evaluate:
+  Now begin your evaluation of the following QA pair. Use the rubric above and be objective and concise in your reasoning.
+
+  Task Description: {{task_description}}
+
   [Start of Question]
   {{ question }}
   [End of Question]
@@ -36,9 +103,9 @@ generation: |
   {{ response }}
   [End of Answer]
 
-  Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the answer on a scale of 1 to 3 as mentioned above.
-  * Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags.
-  * Return the score between [Start of Score] and [End of Score] tags.
+  * Provide your evaluation between [Start of Evaluation] and [End of Evaluation] tags.
+  * Provide the score between [Start of Score] and [End of Score] tags.
+  * Do not include any content outside these tags.
 
 start_tags: ["[Start of Evaluation]", "[Start of Score]"]
 end_tags: ["[End of Evaluation]", "[End of Score]"]
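This evaluator returns its verdict between the [Start of Evaluation]/[End of Evaluation] and [Start of Score]/[End of Score] tags declared in start_tags/end_tags above. As a hedged sketch of downstream filtering, assuming the raw model output is available as a string and using an illustrative quality threshold of score >= 2 (the flow's actual filter value is not shown in this diff):

```python
# Hypothetical filtering on the tagged evaluator output; the sample text is invented.
import re

raw_output = """[Start of Evaluation]
The table is correct but the pipe alignment is inconsistent.
[End of Evaluation]

[Start of Score]
2
[End of Score]"""

match = re.search(r"\[Start of Score\]\s*(\d+)\s*\[End of Score\]", raw_output)
score = int(match.group(1)) if match else 0
keep = score >= 2  # example threshold, not taken from the package
print(score, keep)
```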
sdg_hub/configs/skills/evaluate_freeform_questions.yaml
@@ -1,46 +1,78 @@
 system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
 
 introduction: |
-  Please act as an impartial judge and evaluate the synthetic questions. Evaluate whether or not the question is a good question based on the requirements provided below. Please assign a score using a binary 0/1 scale.
+  Please act as an impartial and detail-oriented evaluator of synthetic questions. Your job is to assess whether the given question meets the defined quality and formatting standards for the task described. Assign a score using a strict binary 0/1 scale.
 
 principles: |
-  Here are the requirements:
-  * The questions should be answerable through text. It should not require any visual or audio output.
-  * The questions should be in English.
-  * The questions should be 1 to 2 sentences long and should be properly formatted.
-  * The question should not be offensive, abusive, or harmful. It should be safe and respectful.
-  * The question should be relevant to the task given - {{ task_description }}.
+  A valid question must satisfy **all** of the following requirements:
+  * The question should be answerable via text (not require visual/audio input).
+  * It must be **clearly relevant to the task description** ({{task_description}}).
+  * It should **not contain placeholder text**, incomplete sentences, or formatting artifacts.
 
-  If the question meets the above requirements, please rate it 1. If not, please rate it 0.
+  If the question satisfies **all** of the above, assign a score of `1`. Otherwise, assign `0`.
 
 examples: |
-  Task Description: Understanding the benefits of renewable energy
+  Example 1 - Valid question:
 
-  [Start of Question]
-  What are the long-term economic benefits of investing in solar energy infrastructure?
-  [End of Question]
+  Task Description: Extract the main idea of a paragraph.
 
-  [Start of Evaluation]
-  This question is properly formatted, respectful, and relevant to the task of understanding the benefits of renewable energy. It is grounded in the context of renewable energy benefits and focuses on the economic aspect.
-  [End of Evaluation]
-
-  [Start of Score]
-  1
-  [End of Score]
+  [Start of Question]
+  What is the central message conveyed by the paragraph?
+  [End of Question]
 
+  [Start of Evaluation]
+  The question is clear, concise, grammatically correct, and directly related to the task. It follows formatting rules and is appropriate in tone.
+  [End of Evaluation]
 
-generation: |
-  Here's the question you need to evaluate:
+  [Start of Score]
+  1
+  [End of Score]
 
-  Task Description: {{ task_description }}
+  Example 2 - Invalid question (bad formatting):
 
-  [Start of Question]
-  {{ question }}
-  [End of Question]
+  Task Description: Extract the main idea of a paragraph.
 
-  Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the question on a scale of 0 or 1 as mentioned above. Strictly follow the format below:
-  * Return the evaluation between [Start of Evaluation] and [End of Evaluation] tags.
-  * Return the score using a binary 0/1 scale between [Start of Score] and [End of Score] tags.
+  [Start of Question]
+  main idea??
+  [End of Question]
+
+  [Start of Evaluation]
+  The question lacks proper capitalization, punctuation, and complete sentence structure. It does not meet the formatting standards.
+  [End of Evaluation]
+
+  [Start of Score]
+  0
+  [End of Score]
+
+  Example 3 - Invalid question (off-topic):
+
+  Task Description: Extract the main idea of a paragraph.
+
+  [Start of Question]
+  What's your favorite type of movie and why?
+  [End of Question]
+
+  [Start of Evaluation]
+  The question is unrelated to the given task description. It fails the relevance requirement.
+  [End of Evaluation]
+
+  [Start of Score]
+  0
+  [End of Score]
+
+generation: |
+  Here's the question you need to evaluate:
+
+  Task Description: {{task_description}}
+
+  [Start of Question]
+  {{question}}
+  [End of Question]
+
+  Now begin your evaluation:
+  * First, provide a brief explanation between [Start of Evaluation] and [End of Evaluation] tags.
+  * Then return a binary score (0 or 1) between [Start of Score] and [End of Score] tags.
+  * Do not include any content outside these tags.
 
 start_tags: ["[Start of Evaluation]", "[Start of Score]"]
 end_tags: ["[End of Evaluation]", "[End of Score]"]