sdg-hub 0.1.0a4__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
- sdg_hub/_version.py +2 -2
- sdg_hub/blocks/__init__.py +35 -5
- sdg_hub/blocks/block.py +58 -16
- sdg_hub/blocks/llmblock.py +121 -193
- sdg_hub/blocks/utilblocks.py +500 -43
- sdg_hub/checkpointer.py +139 -0
- sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
- sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +1 -0
- sdg_hub/configs/knowledge/detailed_summary.yaml +1 -0
- sdg_hub/configs/knowledge/extractive_summary.yaml +1 -0
- sdg_hub/configs/knowledge/generate_questions.yaml +82 -0
- sdg_hub/configs/knowledge/generate_responses.yaml +86 -0
- sdg_hub/configs/skills/contexts.yaml +18 -11
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +79 -12
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +60 -28
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +95 -30
- sdg_hub/configs/skills/freeform_questions.yaml +21 -16
- sdg_hub/configs/skills/freeform_responses.yaml +19 -25
- sdg_hub/configs/skills/router.yaml +53 -6
- sdg_hub/flow.py +351 -21
- sdg_hub/flow_runner.py +216 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +26 -9
- sdg_hub/flows/generation/skills/{agentic_improve_skill.yaml → improve_responses.yaml} +26 -31
- sdg_hub/flows/generation/skills/synth_skills.yaml +4 -4
- sdg_hub/pipeline.py +67 -12
- sdg_hub/prompts.py +21 -0
- sdg_hub/sdg.py +128 -86
- sdg_hub/utils/config_validation.py +91 -0
- sdg_hub/utils/validation_result.py +10 -0
- sdg_hub-0.1.1.dist-info/METADATA +190 -0
- sdg_hub-0.1.1.dist-info/RECORD +86 -0
- {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.1.dist-info}/WHEEL +1 -1
- sdg_hub/blocks/filterblock.py +0 -76
- sdg_hub/blocks/iterblock.py +0 -31
- sdg_hub/blocks/rmblocks.py +0 -194
- sdg_hub/configs/annotations/simple.yaml +0 -10
- sdg_hub/configs/knowledge/data_recipe/default_recipe.yaml +0 -3
- sdg_hub/configs/skills/data_recipe/default_recipe.yaml +0 -6
- sdg_hub/flows/annotation/emotion/detailed_description.yaml +0 -19
- sdg_hub/flows/annotation/emotion/detailed_description_icl.yaml +0 -19
- sdg_hub/flows/annotation/emotion/simple.yaml +0 -19
- sdg_hub/utils/chunking.py +0 -73
- sdg_hub/utils/docprocessor.py +0 -357
- sdg_hub/utils/parse_and_convert.py +0 -392
- sdg_hub-0.1.0a4.dist-info/METADATA +0 -309
- sdg_hub-0.1.0a4.dist-info/RECORD +0 -90
- /sdg_hub/configs/{knowledge/data_recipe → reasoning}/__init__.py +0 -0
- /sdg_hub/configs/skills/{_G_.yaml → icl_examples/STEM.yaml} +0 -0
- /sdg_hub/configs/skills/{data_recipe → icl_examples}/__init__.py +0 -0
- /sdg_hub/configs/skills/{_A_.yaml → icl_examples/coding.yaml} +0 -0
- /sdg_hub/configs/skills/{_B_.yaml → icl_examples/extraction.yaml} +0 -0
- /sdg_hub/configs/skills/{_C_.yaml → icl_examples/humanities.yaml} +0 -0
- /sdg_hub/configs/skills/{_D_.yaml → icl_examples/math.yaml} +0 -0
- /sdg_hub/configs/skills/{_E_.yaml → icl_examples/reasoning.yaml} +0 -0
- /sdg_hub/configs/skills/{_F_.yaml → icl_examples/roleplay.yaml} +0 -0
- /sdg_hub/configs/skills/{_H_.yaml → icl_examples/writing.yaml} +0 -0
- {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.0a4.dist-info → sdg_hub-0.1.1.dist-info}/top_level.txt +0 -0
sdg_hub/checkpointer.py
ADDED
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# Standard
+from typing import Optional, List
+import uuid
+
+# Third Party
+from datasets import Dataset, load_dataset
+from datasets.data_files import EmptyDatasetError
+
+# Local
+from .logger_config import setup_logger
+from .utils.datautils import safe_concatenate_datasets
+
+logger = setup_logger(__name__)
+
+
+class Checkpointer:
+    """
+    Handles checkpointing functionality for SDG data generation.
+    Manages saving intermediate results and loading existing checkpoints.
+    """
+
+    def __init__(self, checkpoint_dir: Optional[str] = None, save_freq: Optional[int] = None):
+        """
+        Initialize the Checkpointer.
+
+        Args:
+            checkpoint_dir: Directory to save/load checkpoints. If None, checkpointing is disabled.
+            save_freq: Frequency for saving intermediate checkpoints during batch processing.
+        """
+        self.checkpoint_dir = checkpoint_dir
+        self.save_freq = save_freq
+
+    def load_existing_data(self, seed_dataset: Dataset) -> tuple[Dataset, Optional[Dataset]]:
+        """
+        Load existing checkpoint data and determine what still needs to be generated.
+
+        Args:
+            seed_dataset: Original input dataset
+
+        Returns:
+            Tuple of (remaining_data_to_generate, pre_generated_data)
+            If no checkpoints exist, returns (seed_dataset, None)
+        """
+        if self.checkpoint_dir is None:
+            return seed_dataset, None
+
+        try:
+            # Load existing checkpoints
+            pre_generated_data = load_dataset(
+                "json", data_dir=self.checkpoint_dir, split="train"
+            )
+            logger.info(
+                f"Loading existing checkpoints from {self.checkpoint_dir}, "
+                f"with {pre_generated_data.num_rows} rows"
+            )
+
+            # Find missing data that still needs to be generated
+            missing_data = self._get_missing_data(seed_dataset, pre_generated_data)
+
+            if missing_data.num_rows == 0:
+                logger.info(
+                    f"All seed data has been generated, no missing rows found, "
+                    f"returning data from {self.checkpoint_dir}"
+                )
+                return missing_data, pre_generated_data
+
+            logger.info(f"Found {missing_data.num_rows} missing rows in the dataset")
+            return missing_data, pre_generated_data
+
+        except EmptyDatasetError:
+            logger.info(
+                f"No existing checkpoints found in {self.checkpoint_dir}, "
+                f"generating from scratch"
+            )
+            return seed_dataset, None
+
+    def _get_missing_data(self, seed_data: Dataset, generated_data: Dataset) -> Dataset:
+        """
+        Identify rows in seed_data that are not present in generated_data.
+
+        Args:
+            seed_data: Original seed dataset
+            generated_data: Previously generated dataset
+
+        Returns:
+            Dataset containing only the missing rows from seed_data
+        """
+        # Get the common columns between the two datasets
+        common_columns = list(
+            set(seed_data.column_names) & set(generated_data.column_names)
+        )
+
+        # Extract the relevant data based on common columns
+        seed_data_common = seed_data.select_columns(common_columns)
+        generated_data_common = generated_data.select_columns(common_columns)
+
+        # Convert to Pandas DataFrames for easier comparison
+        seed_df = seed_data_common.to_pandas()
+        generated_df = generated_data_common.to_pandas()
+
+        # Identify missing rows
+        missing_df = seed_df[
+            ~seed_df.apply(tuple, 1).isin(generated_df.apply(tuple, 1))
+        ]
+
+        # Convert back to Dataset
+        missing_data = Dataset.from_pandas(missing_df, preserve_index=False)
+
+        return missing_data
+
+    def save_intermediate_checkpoint(self, dataset: Dataset) -> None:
+        """
+        Save intermediate checkpoint data to disk.
+
+        Args:
+            dataset: Dataset to save as checkpoint
+        """
+        if self.checkpoint_dir is None:
+            return
+
+        checkpoint_id = uuid.uuid4().hex
+        checkpoint_file = f"{self.checkpoint_dir}/data_checkpoint_{checkpoint_id}.jsonl"
+        logger.info(f"Saving checkpoint to {checkpoint_file}")
+        dataset.to_json(checkpoint_file, orient="records", lines=True)
+
+    def should_save_checkpoint(self, current_split_index: int) -> bool:
+        """
+        Determine if a checkpoint should be saved based on save frequency.
+
+        Args:
+            current_split_index: Current split index (0-based)
+
+        Returns:
+            True if checkpoint should be saved, False otherwise
+        """
+        if self.save_freq is None or self.checkpoint_dir is None:
+            return False
+        return (current_split_index + 1) % self.save_freq == 0
sdg_hub/configs/annotations/detailed_annotations.yaml
ADDED
@@ -0,0 +1,28 @@
+system: You are an expert text classifier trained to label questions from online forums.
+introduction: "Task Description: You will be given a text and you need to annotate it with one of the following categories: World, Sports, Business, Sci/Tech, Economy"
+principles: |
+  Please follow these rules when performing the classification:
+  - Focus on the main topic, not peripheral mentions
+  - Choose the most specific applicable category
+  - Only choose category label per question
+examples: |
+  Text: Bangladesh paralysed by strikes Opposition activists have brought many towns and cities in Bangladesh to a halt, the day after 18 people died in explosions at a political rally.
+  Category: World
+
+  Text: Desiring Stability Redskins coach Joe Gibbs expects few major personnel changes in the offseason and wants to instill a culture of stability in Washington.
+  Category: Sports
+
+  Text: A Cosmic Storm: When Galaxy Clusters Collide Astronomers have found what they are calling the perfect cosmic storm, a galaxy cluster pile-up so powerful its energy output is second only to the Big Bang.
+  Category: Sci/Tech
+
+  Text: Economy builds steam in KC Fed district The economy continued to strengthen in September and early October in the Great Plains and Rocky Mountain regions covered by the Tenth Federal Reserve District, the Federal Reserve Bank of Kansas City said Wednesday.
+  Category: Economy
+
+generation: |
+  Here is the query for annotation:
+
+  Text: {{text}}
+  Category:
+
+start_tags: [""]
+end_tags: [""]
sdg_hub/configs/knowledge/generate_questions.yaml
ADDED
@@ -0,0 +1,82 @@
+# This YAML file defines a prompt template for generating educational Q&A pairs from textbook content.
+# The prompt is designed to create comprehensive, domain-specific questions and answers that
+# effectively teach and reinforce key concepts from educational materials.
+#
+# Structure:
+# - system: Sets the AI's role as a knowledgeable assistant
+# - introduction: Main instruction for creating Q&A pairs from textbook chapters
+# - principles: Detailed guidelines for question formulation and educational value
+# - examples: Example Q&A pairs showing expected format and style
+# - generation: Template for the document to be used for Q&A generation
+#
+# Key Features:
+# - Domain-specific question generation (science, legal, etc.)
+# - Multiple difficulty levels and question types
+# - Self-contained questions without external references
+# - Focus on key concepts and learning objectives
+# - Educational value and teaching effectiveness
+#
+# Question Guidelines:
+# - Must be self-contained and independently answerable
+# - Should cover basic recall to advanced comprehension
+# - Include multiple-choice, short answer, and essay types
+# - Align with chapter learning objectives
+# - Avoid references to specific sections or figures
+#
+# Response Format:
+# - Questions and answers are clearly separated
+# - Each response ends with [End] tag
+# - [UNANSWERABLE] for unsuitable content
+#
+# Usage:
+# This prompt is used to generate educational Q&A pairs that effectively teach
+# and reinforce concepts from textbook chapters while maintaining educational
+# value and accessibility.
+
+system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
+
+introduction: Develop a series of educational questions from a chapter in a {{domain}} textbook.
+
+principles: |
+  The questions should:
+  * Self-contained – understandable without needing to reference tables, figures, or specific text sections.
+  * Focus on the provided example and follow the format and style of the provided examples.
+  * Relevant to the subject – based on the textbook’s domain (e.g., legal, scientific, etc.).
+  * Independently answerable – avoid direct references to theorems, figures, or text numbers.
+  * Varied in difficulty - Make difficult same as the provided examples.
+  * Use same format as the provided examples.
+
+  Strictly follow this format for each question your generate while responding
+
+  [QUESTION]
+  <Insert question here>
+  [END]
+
+
+examples: |
+  Here are some examples of questions:
+
+  [Document]
+  {{icl_document}}
+
+  [QUESTION]
+  {{icl_query_1}}
+  [END]
+
+  [QUESTION]
+  {{icl_query_2}}
+  [END]
+
+  [QUESTION]
+  {{icl_query_3}}
+  [END]
+
+generation: |
+  Here is the document:
+
+  [DOCUMENT]
+  {{document_outline}}
+  {{document}}
+
+start_tags: [""]
+end_tags: [""]
sdg_hub/configs/knowledge/generate_responses.yaml
ADDED
@@ -0,0 +1,86 @@
+# This YAML file defines a prompt template for generating educational Q&A pairs from textbook content.
+# The prompt is designed to create comprehensive, domain-specific questions and answers that
+# effectively teach and reinforce key concepts from educational materials.
+#
+# Structure:
+# - system: Sets the AI's role as a knowledgeable assistant
+# - introduction: Main instruction for creating Q&A pairs from textbook chapters
+# - principles: Detailed guidelines for question formulation and educational value
+# - examples: Example Q&A pairs showing expected format and style
+# - generation: Template for the document to be used for Q&A generation
+#
+# Key Features:
+# - Domain-specific question generation (science, legal, etc.)
+# - Multiple difficulty levels and question types
+# - Self-contained questions without external references
+# - Focus on key concepts and learning objectives
+# - Educational value and teaching effectiveness
+#
+# Question Guidelines:
+# - Must be self-contained and independently answerable
+# - Should cover basic recall to advanced comprehension
+# - Include multiple-choice, short answer, and essay types
+# - Align with chapter learning objectives
+# - Avoid references to specific sections or figures
+#
+# Response Format:
+# - Questions and answers are clearly separated
+# - Each response ends with [End] tag
+# - [UNANSWERABLE] for unsuitable content
+#
+# Usage:
+# This prompt is used to generate educational Q&A pairs that effectively teach
+# and reinforce concepts from textbook chapters while maintaining educational
+# value and accessibility.
+
+system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
+
+introduction: Answer the question based on the provided document.
+
+principles: |
+  The answers should:
+  * The answer is grounded in the provided document.
+  * Follows the format and style of the provided examples.
+  * Directly answers the question.
+  Strictly follow this format for each question your generate while responding
+
+  [ANSWER]
+  <Insert answer here>
+  [END]
+
+
+examples: |
+  Here are some examples of answers for given questions for a document:
+
+  [Document]
+  {{icl_document}}
+
+  [QUESTION]
+  {{icl_query_1}}
+
+  [ANSWER]
+  {{icl_response_1}}
+  [END]
+
+  [QUESTION]
+  {{icl_query_2}}
+
+  [ANSWER]
+  {{icl_response_2}}
+  [END]
+
+
+generation: |
+  Here is the document:
+
+  [DOCUMENT]
+  {{document_outline}}
+  {{document}}
+
+  [QUESTION]
+  {{question}}
+
+  [ANSWER]
+
+start_tags: [""]
+end_tags: [""]
sdg_hub/configs/skills/contexts.yaml
CHANGED
@@ -1,21 +1,28 @@
-system: You are a
+system: You are a highly capable AI Assistant that specializes in generating high-quality content tailored to specific tasks.
+
+introduction: |
+  Your task is to write a rich, relevant, and well-structured **context** for the following task:
+  Task Description: {{task_description}}
 
-introduction: You are asked to come up with a diverse context for - {{ task_description }}.
 principles: |
-  Please follow these guiding principles when generating
-  *
-  *
-  *
-  *
-  *
+  Please follow these guiding principles when generating the context:
+  * The context should be coherent, informative, and closely aligned with the task description.
+  * Do not include any greetings, explanations, or meta commentary.
+  * Maintain a natural, human-like tone suitable for the domain.
+  * Follow the formatting shown in the example exactly.
+  * Wrap the output between the tags: [Start of Context] and [End of Context].
+
 examples: |
-  To
+  To guide you, here is an example of a well-structured context:
+
   [Start of Context]
-  {{
+  {{seed_context}}
   [End of Context]
 
 generation: |
-  Now generate a context
+  Now generate a new context following the same structure and principles.
+  Begin your output with [Start of Context] and end with [End of Context].
+  Do not include any additional text outside these tags.
 
 start_tags: ["[Start of Context]"]
 end_tags: ["[End of Context]"]
sdg_hub/configs/skills/evaluate_freeform_pair.yaml
CHANGED
@@ -1,25 +1,89 @@
-system: You are a
+system: You are a highly knowledgeable and impartial AI Assistant tasked with evaluating the quality of responses to user questions.
 
 introduction: |
-  Please act as an
+  Please act as an objective evaluator and assess whether the AI Assistant's answer correctly follows formatting requirements and answers the user's question. Use the scoring rubric below and assign a score from 1 to 3.
+
 principles: |
-
+  Use the following 3-point scale to score the answer:
+
+  **1 — Poor Quality**
+  - The output is incorrectly formatted, contains hallucinations, or ignores required tags/structure.
+  - The answer may be off-topic, incomplete, or inconsistent with the task.
+  - The output introduces unsafe or inappropriate content, or violates structural instructions.
 
-  2
+  **2 — Acceptable but Minimal**
+  - The answer is structurally valid and safe, but lacks polish, clarity, or minor formatting correctness.
+  - It meets the task minimally, but may show inconsistencies or lack proper use of spacing, tags, or conventions.
+  - It's not harmful, but also not ready for use without revision.
 
-  3
+  **3 — Excellent Answer**
+  - The answer is fully correct, clearly written, and **strictly adheres to formatting instructions**.
+  - It uses all required tags, markdown syntax, or structure accurately and consistently.
+  - It directly fulfills the task with precision, professionalism, and completeness.
 
 examples: |
+  Example 1 — Score: 1 (Poor Formatting, Unsafe)
+
+  Task Description: Generate a markdown table of 3 planets with columns: Name, Gravity, and Moons.
+
+  [Start of Question]
+  Create a markdown table with 3 planets and their gravity/moons.
+  [End of Question]
+
+  [Start of Answer]
+  Mars - 3.7 - 2; Earth - 9.8 - 1; Jupiter - 24.8 - 79
+  [End of Answer]
+
+  [Start of Evaluation]
+  The answer is improperly formatted (not a markdown table) and unstructured. It violates task instructions both structurally and in tone.
+  [End of Evaluation]
+
+  [Start of Score]
+  1
+  [End of Score]
+
+  Example 2 — Score: 2 (Correct but Sloppy Formatting)
+
+  Task Description: Generate a markdown table of 3 planets with columns: Name, Gravity, and Moons.
+
   [Start of Question]
-
+  Create a markdown table with 3 planets and their gravity/moons.
   [End of Question]
 
+  [Start of Answer]
+  | Name | Gravity | Moons |
+  |-------|---------|-------|
+  | Mars | 3.7 | 2 |
+  | Earth| 9.8| 1 |
+  | Jupiter |24.8 |79|
+  [End of Answer]
+
   [Start of Evaluation]
-
+  The table has correct content but inconsistent spacing and pipe alignment. It fulfills the task but lacks polish and readability. It's usable, but not clean or well-structured.
   [End of Evaluation]
 
+  [Start of Score]
+  2
+  [End of Score]
+
+  Example 3 — Score: 3 (Flawless Markdown Table)
+
+  Task Description: Generate a markdown table of 3 planets with columns: Name, Gravity, and Moons.
+
+  [Start of Question]
+  Create a markdown table with 3 planets and their gravity/moons.
+  [End of Question]
+
+  [Start of Answer]
+  | Name | Gravity (m/s²) | Moons |
+  |----------|----------------|-------|
+  | Mars | 3.7 | 2 |
+  | Earth | 9.8 | 1 |
+  | Jupiter | 24.8 | 79 |
+  [End of Answer]
+
   [Start of Evaluation]
-  The answer
+  The answer uses proper markdown syntax, alignment, and column headers. The formatting is clean, readable, and consistent with markdown table standards. It meets the task precisely.
   [End of Evaluation]
 
   [Start of Score]
@@ -27,7 +91,10 @@ examples: |
   [End of Score]
 
 generation: |
-
+  Now begin your evaluation of the following QA pair. Use the rubric above and be objective and concise in your reasoning.
+
+  Task Description: {{task_description}}
+
   [Start of Question]
   {{ question }}
   [End of Question]
@@ -36,9 +103,9 @@ generation: |
   {{ response }}
   [End of Answer]
 
-
-  *
-  *
+  * Provide your evaluation between [Start of Evaluation] and [End of Evaluation] tags.
+  * Provide the score between [Start of Score] and [End of Score] tags.
+  * Do not include any content outside these tags.
 
 start_tags: ["[Start of Evaluation]", "[Start of Score]"]
 end_tags: ["[End of Evaluation]", "[End of Score]"]
sdg_hub/configs/skills/evaluate_freeform_questions.yaml
CHANGED
@@ -1,46 +1,78 @@
 system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
 
 introduction: |
-
+  Please act as an impartial and detail-oriented evaluator of synthetic questions. Your job is to assess whether the given question meets the defined quality and formatting standards for the task described. Assign a score using a strict binary 0/1 scale.
 
 principles: |
-
-
-
-
-  * The question should not be offensive, abusive, or harmful. It should be safe and respectful.
-  * The question should be relevant to the task given - {{ task_description }}.
+  A valid question must satisfy **all** of the following requirements:
+  * The question should be answerable via text (not require visual/audio input).
+  * It must be **clearly relevant to the task description** ({{task_description}}).
+  * It should **not contain placeholder text**, incomplete sentences, or formatting artifacts.
 
-
+  If the question satisfies **all** of the above, assign a score of `1`. Otherwise, assign `0`.
 
 examples: |
-
+  Example 1 - Valid question:
 
-
-  What are the long-term economic benefits of investing in solar energy infrastructure?
-  [End of Question]
+  Task Description: Extract the main idea of a paragraph.
 
-
-
-
-
-  [Start of Score]
-  1
-  [End of Score]
+  [Start of Question]
+  What is the central message conveyed by the paragraph?
+  [End of Question]
 
+  [Start of Evaluation]
+  The question is clear, concise, grammatically correct, and directly related to the task. It follows formatting rules and is appropriate in tone.
+  [End of Evaluation]
 
-
-
+  [Start of Score]
+  1
+  [End of Score]
 
-
+  Example 2 - Invalid question (bad formatting):
 
-
-  {{ question }}
-  [End of Question]
+  Task Description: Extract the main idea of a paragraph.
 
-
-
-
+  [Start of Question]
+  main idea??
+  [End of Question]
+
+  [Start of Evaluation]
+  The question lacks proper capitalization, punctuation, and complete sentence structure. It does not meet the formatting standards.
+  [End of Evaluation]
+
+  [Start of Score]
+  0
+  [End of Score]
+
+  Example 3 - Invalid question (off-topic):
+
+  Task Description: Extract the main idea of a paragraph.
+
+  [Start of Question]
+  What's your favorite type of movie and why?
+  [End of Question]
+
+  [Start of Evaluation]
+  The question is unrelated to the given task description. It fails the relevance requirement.
+  [End of Evaluation]
+
+  [Start of Score]
+  0
+  [End of Score]
+
+generation: |
+  Here's the question you need to evaluate:
+
+  Task Description: {{task_description}}
+
+  [Start of Question]
+  {{question}}
+  [End of Question]
+
+  Now begin your evaluation:
+  * First, provide a brief explanation between [Start of Evaluation] and [End of Evaluation] tags.
+  * Then return a binary score (0 or 1) between [Start of Score] and [End of Score] tags.
+  * Do not include any content outside these tags.
 
 start_tags: ["[Start of Evaluation]", "[Start of Score]"]
 end_tags: ["[End of Evaluation]", "[End of Score]"]