sdg-hub 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +3 -0
- sdg_hub/_version.py +21 -0
- sdg_hub/blocks/__init__.py +36 -0
- sdg_hub/blocks/block.py +96 -0
- sdg_hub/blocks/llmblock.py +375 -0
- sdg_hub/blocks/utilblocks.py +597 -0
- sdg_hub/checkpointer.py +139 -0
- sdg_hub/configs/__init__.py +0 -0
- sdg_hub/configs/annotations/__init__.py +0 -0
- sdg_hub/configs/annotations/cot_reflection.yaml +34 -0
- sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
- sdg_hub/configs/annotations/detailed_description.yaml +10 -0
- sdg_hub/configs/annotations/detailed_description_icl.yaml +32 -0
- sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +45 -0
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +35 -0
- sdg_hub/configs/knowledge/detailed_summary.yaml +17 -0
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +68 -0
- sdg_hub/configs/knowledge/evaluate_question.yaml +38 -0
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +85 -0
- sdg_hub/configs/knowledge/extractive_summary.yaml +17 -0
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +39 -0
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +56 -0
- sdg_hub/configs/knowledge/mcq_generation.yaml +83 -0
- sdg_hub/configs/knowledge/router.yaml +12 -0
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +34 -0
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +40 -0
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +48 -0
- sdg_hub/configs/skills/annotation.yaml +36 -0
- sdg_hub/configs/skills/contexts.yaml +28 -0
- sdg_hub/configs/skills/critic.yaml +60 -0
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +111 -0
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +78 -0
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +119 -0
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +51 -0
- sdg_hub/configs/skills/freeform_questions.yaml +34 -0
- sdg_hub/configs/skills/freeform_responses.yaml +39 -0
- sdg_hub/configs/skills/grounded_questions.yaml +38 -0
- sdg_hub/configs/skills/grounded_responses.yaml +59 -0
- sdg_hub/configs/skills/icl_examples/STEM.yaml +56 -0
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +97 -0
- sdg_hub/configs/skills/icl_examples/extraction.yaml +36 -0
- sdg_hub/configs/skills/icl_examples/humanities.yaml +71 -0
- sdg_hub/configs/skills/icl_examples/math.yaml +85 -0
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +30 -0
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +45 -0
- sdg_hub/configs/skills/icl_examples/writing.yaml +80 -0
- sdg_hub/configs/skills/judge.yaml +53 -0
- sdg_hub/configs/skills/planner.yaml +67 -0
- sdg_hub/configs/skills/respond.yaml +8 -0
- sdg_hub/configs/skills/revised_responder.yaml +78 -0
- sdg_hub/configs/skills/router.yaml +59 -0
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +27 -0
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +31 -0
- sdg_hub/flow.py +306 -0
- sdg_hub/flow_runner.py +204 -0
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +13 -0
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +12 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +89 -0
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +136 -0
- sdg_hub/flows/generation/skills/improve_responses.yaml +103 -0
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +12 -0
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +12 -0
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +80 -0
- sdg_hub/flows/generation/skills/synth_skills.yaml +59 -0
- sdg_hub/logger_config.py +20 -0
- sdg_hub/pipeline.py +121 -0
- sdg_hub/prompts.py +43 -0
- sdg_hub/py.typed +0 -0
- sdg_hub/registry.py +122 -0
- sdg_hub/sdg.py +206 -0
- sdg_hub/utils/__init__.py +5 -0
- sdg_hub/utils/datautils.py +14 -0
- sdg_hub-0.1.0.dist-info/METADATA +190 -0
- sdg_hub-0.1.0.dist-info/RECORD +82 -0
- sdg_hub-0.1.0.dist-info/WHEEL +5 -0
- sdg_hub-0.1.0.dist-info/licenses/LICENSE +201 -0
- sdg_hub-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,136 @@
|
|
1
|
+
- block_type: DuplicateColumns
|
2
|
+
block_config:
|
3
|
+
block_name: duplicate_document_col
|
4
|
+
columns_map:
|
5
|
+
document: base_document
|
6
|
+
|
7
|
+
- block_type: LLMBlock
|
8
|
+
block_config:
|
9
|
+
block_name: gen_detailed_summary
|
10
|
+
config_path: configs/knowledge/detailed_summary.yaml
|
11
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
12
|
+
output_cols:
|
13
|
+
- summary_detailed
|
14
|
+
gen_kwargs:
|
15
|
+
max_tokens: 2048
|
16
|
+
|
17
|
+
- block_type: LLMBlock
|
18
|
+
block_config:
|
19
|
+
block_name: gen_atomic_facts
|
20
|
+
config_path: configs/knowledge/atomic_facts.yaml
|
21
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
22
|
+
output_cols:
|
23
|
+
- summary_atomic_facts
|
24
|
+
gen_kwargs:
|
25
|
+
max_tokens: 2048
|
26
|
+
|
27
|
+
- block_type: LLMBlock
|
28
|
+
block_config:
|
29
|
+
block_name: gen_extractive_summary
|
30
|
+
config_path: configs/knowledge/extractive_summary.yaml
|
31
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
32
|
+
output_cols:
|
33
|
+
- summary_extractive
|
34
|
+
gen_kwargs:
|
35
|
+
max_tokens: 2048
|
36
|
+
|
37
|
+
- block_type: FlattenColumnsBlock
|
38
|
+
block_config:
|
39
|
+
block_name: flatten_summary_columns
|
40
|
+
var_cols:
|
41
|
+
- summary_detailed
|
42
|
+
- summary_extractive
|
43
|
+
- summary_atomic_facts
|
44
|
+
- base_document
|
45
|
+
value_name: summary
|
46
|
+
var_name: dataset_type
|
47
|
+
|
48
|
+
- block_type: RenameColumns
|
49
|
+
block_config:
|
50
|
+
block_name: rename_to_document_column
|
51
|
+
columns_map:
|
52
|
+
document: raw_document
|
53
|
+
summary: document
|
54
|
+
|
55
|
+
- block_type: LLMBlock
|
56
|
+
block_config:
|
57
|
+
block_name: knowledge generation
|
58
|
+
config_path: configs/knowledge/generate_questions_responses.yaml
|
59
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
60
|
+
output_cols:
|
61
|
+
- question
|
62
|
+
- response
|
63
|
+
parser_kwargs:
|
64
|
+
parser_name: custom
|
65
|
+
parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
|
66
|
+
parser_cleanup_tags:
|
67
|
+
- "[END]"
|
68
|
+
gen_kwargs:
|
69
|
+
temperature: 0.0
|
70
|
+
max_tokens: 2048
|
71
|
+
|
72
|
+
- block_type: LLMBlock
|
73
|
+
block_config:
|
74
|
+
block_name: eval_faithfulness_qa_pair
|
75
|
+
config_path: configs/knowledge/evaluate_faithfulness.yaml
|
76
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
77
|
+
output_cols:
|
78
|
+
- explanation
|
79
|
+
- judgment
|
80
|
+
gen_kwargs:
|
81
|
+
max_tokens: 2048
|
82
|
+
|
83
|
+
- block_type: FilterByValueBlock
|
84
|
+
block_config:
|
85
|
+
block_name: filter_faithfulness
|
86
|
+
filter_column: judgment
|
87
|
+
filter_value: "YES"
|
88
|
+
operation: operator.eq
|
89
|
+
drop_columns:
|
90
|
+
- judgment
|
91
|
+
- explanation
|
92
|
+
|
93
|
+
- block_type: LLMBlock
|
94
|
+
block_config:
|
95
|
+
block_name: eval_relevancy_qa_pair
|
96
|
+
config_path: configs/knowledge/evaluate_relevancy.yaml
|
97
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
98
|
+
output_cols:
|
99
|
+
- feedback
|
100
|
+
- score
|
101
|
+
gen_kwargs:
|
102
|
+
max_tokens: 2048
|
103
|
+
|
104
|
+
- block_type: FilterByValueBlock
|
105
|
+
block_config:
|
106
|
+
block_name: filter_relevancy
|
107
|
+
filter_column: score
|
108
|
+
filter_value: 2.0
|
109
|
+
operation: operator.eq
|
110
|
+
convert_dtype: float
|
111
|
+
drop_columns:
|
112
|
+
- feedback
|
113
|
+
- score
|
114
|
+
|
115
|
+
- block_type: LLMBlock
|
116
|
+
block_config:
|
117
|
+
block_name: eval_verify_question
|
118
|
+
config_path: configs/knowledge/evaluate_question.yaml
|
119
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
120
|
+
output_cols:
|
121
|
+
- explanation
|
122
|
+
- rating
|
123
|
+
gen_kwargs:
|
124
|
+
max_tokens: 2048
|
125
|
+
|
126
|
+
- block_type: FilterByValueBlock
|
127
|
+
block_config:
|
128
|
+
block_name: filter_verify_question
|
129
|
+
filter_column: rating
|
130
|
+
filter_value: 1.0
|
131
|
+
operation: operator.eq
|
132
|
+
convert_dtype: float
|
133
|
+
drop_columns:
|
134
|
+
- explanation
|
135
|
+
- rating
|
136
|
+
- __index_level_0__
|
@@ -0,0 +1,103 @@
|
|
1
|
+
- block_type: LLMBlock
|
2
|
+
block_config:
|
3
|
+
block_name: router
|
4
|
+
config_path: configs/skills/router.yaml
|
5
|
+
model_id: meta-llama/Llama-3.3-70B-Instruct
|
6
|
+
output_cols:
|
7
|
+
- route
|
8
|
+
gen_kwargs:
|
9
|
+
temperature: 0
|
10
|
+
max_tokens: 5
|
11
|
+
extra_body:
|
12
|
+
guided_choice:
|
13
|
+
- "coding"
|
14
|
+
- "extraction"
|
15
|
+
- "humanities"
|
16
|
+
- "math"
|
17
|
+
- "reasoning"
|
18
|
+
- "roleplay"
|
19
|
+
- "STEM"
|
20
|
+
- "writing"
|
21
|
+
- block_type: SamplePopulatorBlock
|
22
|
+
block_config:
|
23
|
+
block_name: icl_populator
|
24
|
+
config_paths:
|
25
|
+
- configs/skills/icl_examples/coding.yaml
|
26
|
+
- configs/skills/icl_examples/extraction.yaml
|
27
|
+
- configs/skills/icl_examples/humanities.yaml
|
28
|
+
- configs/skills/icl_examples/math.yaml
|
29
|
+
- configs/skills/icl_examples/reasoning.yaml
|
30
|
+
- configs/skills/icl_examples/roleplay.yaml
|
31
|
+
- configs/skills/icl_examples/STEM.yaml
|
32
|
+
- configs/skills/icl_examples/writing.yaml
|
33
|
+
column_name: route
|
34
|
+
batch_kwargs:
|
35
|
+
num_procs: 8
|
36
|
+
- block_type: LLMBlock
|
37
|
+
block_config:
|
38
|
+
block_name: analyzer
|
39
|
+
config_path: configs/skills/analyzer.yaml
|
40
|
+
model_id: meta-llama/Llama-3.3-70B-Instruct
|
41
|
+
output_cols:
|
42
|
+
- analysis
|
43
|
+
- rubric
|
44
|
+
- block_type: LLMBlock
|
45
|
+
block_config:
|
46
|
+
block_name: critic
|
47
|
+
config_path: configs/skills/critic.yaml
|
48
|
+
model_id: meta-llama/Llama-3.3-70B-Instruct
|
49
|
+
output_cols:
|
50
|
+
- critique
|
51
|
+
- block_type: LLMBlock
|
52
|
+
block_config:
|
53
|
+
block_name: planner
|
54
|
+
config_path: configs/skills/planner.yaml
|
55
|
+
model_id: meta-llama/Llama-3.3-70B-Instruct
|
56
|
+
output_cols:
|
57
|
+
- plan
|
58
|
+
- block_type: LLMBlock
|
59
|
+
block_config:
|
60
|
+
block_name: revised_responder
|
61
|
+
config_path: configs/skills/revised_responder.yaml
|
62
|
+
model_id: meta-llama/Llama-3.3-70B-Instruct
|
63
|
+
output_cols:
|
64
|
+
- revised_response
|
65
|
+
drop_columns:
|
66
|
+
- icl_query
|
67
|
+
- icl_response
|
68
|
+
- icl_analysis
|
69
|
+
- icl_rubric
|
70
|
+
- icl_critique
|
71
|
+
- icl_plan
|
72
|
+
- icl_revised_response
|
73
|
+
- block_type: LLMBlock
|
74
|
+
block_config:
|
75
|
+
block_name: judge
|
76
|
+
config_path: configs/skills/judge.yaml
|
77
|
+
model_id: meta-llama/Llama-3.3-70B-Instruct
|
78
|
+
output_cols:
|
79
|
+
- judgement
|
80
|
+
- verdict
|
81
|
+
- block_type: FilterByValueBlock
|
82
|
+
block_config:
|
83
|
+
block_name: filter_judgement
|
84
|
+
filter_column: verdict
|
85
|
+
filter_value:
|
86
|
+
- Assistant A
|
87
|
+
- Assistant B
|
88
|
+
operation: operator.contains
|
89
|
+
batch_kwargs:
|
90
|
+
num_procs: 8
|
91
|
+
- block_type: SelectorBlock
|
92
|
+
block_config:
|
93
|
+
block_name: response_selector
|
94
|
+
choice_map:
|
95
|
+
Assistant A: "response"
|
96
|
+
Assistant B: "revised_response"
|
97
|
+
choice_col: verdict
|
98
|
+
output_col: chosen_response
|
99
|
+
batch_kwargs:
|
100
|
+
num_procs: 8
|
101
|
+
drop_columns:
|
102
|
+
    - judgement
|
103
|
+
- verdict
|
@@ -0,0 +1,12 @@
|
|
1
|
+
- block_type: LLMBlock
|
2
|
+
block_config:
|
3
|
+
block_name: gen_skill_freeform
|
4
|
+
config_path: configs/skills/simple_generate_qa_freeform.yaml
|
5
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
6
|
+
output_cols:
|
7
|
+
- output
|
8
|
+
gen_kwargs:
|
9
|
+
temperature: 0.7
|
10
|
+
max_tokens: 2048
|
11
|
+
drop_duplicates:
|
12
|
+
- output
|
@@ -0,0 +1,12 @@
|
|
1
|
+
- block_type: LLMBlock
|
2
|
+
block_config:
|
3
|
+
block_name: gen_skill_grounded
|
4
|
+
config_path: configs/skills/simple_generate_qa_grounded.yaml
|
5
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
6
|
+
output_cols:
|
7
|
+
- output
|
8
|
+
gen_kwargs:
|
9
|
+
temperature: 0.7
|
10
|
+
max_tokens: 2048
|
11
|
+
drop_duplicates:
|
12
|
+
- output
|
@@ -0,0 +1,80 @@
|
|
1
|
+
- block_type: LLMBlock
|
2
|
+
block_config:
|
3
|
+
block_name: gen_contexts
|
4
|
+
config_path: configs/skills/contexts.yaml
|
5
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
6
|
+
output_cols:
|
7
|
+
- context
|
8
|
+
gen_kwargs:
|
9
|
+
temperature: 0.7
|
10
|
+
max_tokens: 2048
|
11
|
+
n: 10
|
12
|
+
seed: 42
|
13
|
+
drop_duplicates:
|
14
|
+
- context
|
15
|
+
- block_type: LLMBlock
|
16
|
+
block_config:
|
17
|
+
block_name: gen_grounded_questions
|
18
|
+
config_path: configs/skills/grounded_questions.yaml
|
19
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
20
|
+
output_cols:
|
21
|
+
- question
|
22
|
+
batch_kwargs:
|
23
|
+
num_samples: 3
|
24
|
+
drop_duplicates:
|
25
|
+
- question
|
26
|
+
- block_type: LLMBlock
|
27
|
+
block_config:
|
28
|
+
block_name: eval_grounded_questions
|
29
|
+
config_path: configs/skills/evaluate_grounded_questions.yaml
|
30
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
31
|
+
output_cols:
|
32
|
+
- evaluation
|
33
|
+
- score
|
34
|
+
- block_type: FilterByValueBlock
|
35
|
+
block_config:
|
36
|
+
block_name: filter_grounded_questions
|
37
|
+
filter_column: score
|
38
|
+
filter_value: 1.0
|
39
|
+
operation: operator.eq
|
40
|
+
convert_dtype: float
|
41
|
+
batch_kwargs:
|
42
|
+
num_procs: 8
|
43
|
+
drop_columns:
|
44
|
+
- evaluation
|
45
|
+
- score
|
46
|
+
- num_samples
|
47
|
+
- block_type: LLMBlock
|
48
|
+
block_config:
|
49
|
+
block_name: gen_grounded_responses
|
50
|
+
config_path: configs/skills/grounded_responses.yaml
|
51
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
52
|
+
output_cols:
|
53
|
+
- response
|
54
|
+
- block_type: LLMBlock
|
55
|
+
block_config:
|
56
|
+
block_name: evaluate_grounded_qa_pair
|
57
|
+
config_path: configs/skills/evaluate_grounded_pair.yaml
|
58
|
+
model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
|
59
|
+
output_cols:
|
60
|
+
- evaluation
|
61
|
+
- score
|
62
|
+
- block_type: FilterByValueBlock
|
63
|
+
block_config:
|
64
|
+
block_name: filter_grounded_qa_pair
|
65
|
+
filter_column: score
|
66
|
+
filter_value: 2.0
|
67
|
+
operation: operator.ge
|
68
|
+
convert_dtype: float
|
69
|
+
batch_kwargs:
|
70
|
+
num_procs: 8
|
71
|
+
- block_type: CombineColumnsBlock
|
72
|
+
block_config:
|
73
|
+
block_name: combine_question_and_context
|
74
|
+
columns:
|
75
|
+
- context
|
76
|
+
- question
|
77
|
+
output_col: question
|
78
|
+
batch_kwargs:
|
79
|
+
num_procs: 8
|
80
|
+
batched: True
|
@@ -0,0 +1,59 @@
|
|
1
|
+
- block_type: LLMBlock
|
2
|
+
block_config:
|
3
|
+
block_name: gen_questions
|
4
|
+
config_path: configs/skills/freeform_questions.yaml
|
5
|
+
model_id: meta-llama/Llama-3.3-70B-Instruct
|
6
|
+
output_cols:
|
7
|
+
- question
|
8
|
+
batch_kwargs:
|
9
|
+
num_samples: 30
|
10
|
+
drop_duplicates:
|
11
|
+
- question
|
12
|
+
- block_type: LLMBlock
|
13
|
+
block_config:
|
14
|
+
block_name: eval_questions
|
15
|
+
config_path: configs/skills/evaluate_freeform_questions.yaml
|
16
|
+
model_id: meta-llama/Llama-3.3-70B-Instruct
|
17
|
+
output_cols:
|
18
|
+
- evaluation
|
19
|
+
- score
|
20
|
+
- block_type: FilterByValueBlock
|
21
|
+
block_config:
|
22
|
+
block_name: filter_questions
|
23
|
+
filter_column: score
|
24
|
+
filter_value: 1.0
|
25
|
+
operation: operator.eq
|
26
|
+
convert_dtype: float
|
27
|
+
batch_kwargs:
|
28
|
+
num_procs: 8
|
29
|
+
drop_columns:
|
30
|
+
- evaluation
|
31
|
+
- score
|
32
|
+
- num_samples
|
33
|
+
- block_type: LLMBlock
|
34
|
+
block_config:
|
35
|
+
block_name: gen_responses
|
36
|
+
config_path: configs/skills/freeform_responses.yaml
|
37
|
+
model_id: meta-llama/Llama-3.3-70B-Instruct
|
38
|
+
output_cols:
|
39
|
+
- response
|
40
|
+
- block_type: LLMBlock
|
41
|
+
block_config:
|
42
|
+
block_name: evaluate_qa_pair
|
43
|
+
config_path: configs/skills/evaluate_freeform_pair.yaml
|
44
|
+
model_id: meta-llama/Llama-3.3-70B-Instruct
|
45
|
+
output_cols:
|
46
|
+
- evaluation
|
47
|
+
- score
|
48
|
+
- block_type: FilterByValueBlock
|
49
|
+
block_config:
|
50
|
+
block_name: filter_qa_pair
|
51
|
+
filter_column: score
|
52
|
+
filter_value: 2.0
|
53
|
+
operation: operator.ge
|
54
|
+
convert_dtype: float
|
55
|
+
batch_kwargs:
|
56
|
+
num_procs: 8
|
57
|
+
drop_columns:
|
58
|
+
- evaluation
|
59
|
+
- score
|
sdg_hub/logger_config.py
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
# Standard
|
3
|
+
import os
|
4
|
+
import logging
|
5
|
+
|
6
|
+
# Third Party
|
7
|
+
from rich.logging import RichHandler
|
8
|
+
|
9
|
+
|
10
|
+
def setup_logger(name):
    """Configure root logging with a Rich handler and return a named logger.

    The level is read from the ``LOG_LEVEL`` environment variable,
    defaulting to ``INFO``. Note that ``logging.basicConfig`` only takes
    effect on its first call; subsequent calls are no-ops, so repeated
    invocations simply return the requested logger.

    Parameters
    ----------
    name : str
        Name for the logger, typically ``__name__`` of the calling module.

    Returns
    -------
    logging.Logger
        The logger registered under *name*.
    """
    logging.basicConfig(
        level=os.getenv("LOG_LEVEL", "INFO"),
        format="%(message)s",
        datefmt="[%X]",
        handlers=[RichHandler()],
    )
    return logging.getLogger(name)
|
sdg_hub/pipeline.py
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
"""
|
2
|
+
Deprecated Pipeline class for data generation pipelines.
|
3
|
+
|
4
|
+
Use the Flow class directly for new code.
|
5
|
+
"""
|
6
|
+
|
7
|
+
# SPDX-License-Identifier: Apache-2.0
|
8
|
+
# Standard
|
9
|
+
import warnings
|
10
|
+
from typing import List, Dict, Any
|
11
|
+
|
12
|
+
# Third Party
|
13
|
+
from datasets import Dataset
|
14
|
+
from datasets.data_files import EmptyDatasetError
|
15
|
+
|
16
|
+
# Local
|
17
|
+
from .logger_config import setup_logger
|
18
|
+
|
19
|
+
logger = setup_logger(__name__)
|
20
|
+
|
21
|
+
|
22
|
+
class Pipeline:
    """Deprecated sequential runner for data generation blocks.

    This class is deprecated and will be removed in a future version.
    Use the Flow class directly instead.

    Each entry in ``chained_blocks`` is a dict providing ``block_type``
    (the block class), ``block_config`` (constructor kwargs, including a
    ``block_name``), and optional ``gen_kwargs``, ``drop_columns`` and
    ``drop_duplicates`` keys.

    Parameters
    ----------
    chained_blocks : List[Dict[str, Any]]
        List of block configurations to execute in sequence.

    Attributes
    ----------
    chained_blocks : List[Dict[str, Any]]
        List of block configurations to execute in sequence.
    """

    def __init__(self, chained_blocks: List[Dict[str, Any]]) -> None:
        """Store the block configurations and emit a deprecation warning.

        DEPRECATED: This class is deprecated and will be removed in a future
        version. Use the Flow class directly instead.
        """
        warnings.warn(
            "Pipeline class is deprecated and will be removed in a future version. "
            "Use Flow class directly instead of wrapping it with Pipeline.",
            DeprecationWarning,
            stacklevel=2
        )
        # pipeline config is the run configuration that consists of the pipeline steps
        self.chained_blocks = chained_blocks

    def _drop_duplicates(self, dataset: Dataset, cols: List[str]) -> Dataset:
        """Drop duplicate rows from *dataset*, keyed on *cols*.

        Parameters
        ----------
        dataset : Dataset
            The input dataset.
        cols : List[str]
            Columns to consider for duplicate detection.

        Returns
        -------
        Dataset
            Dataset with duplicates removed.
        """
        # Round-trip through pandas: datasets has no native dedup primitive.
        deduped = dataset.to_pandas().drop_duplicates(subset=cols).reset_index(drop=True)
        return Dataset.from_pandas(deduped)

    def generate(self, dataset: Dataset) -> Dataset:
        """Run every configured block over *dataset* in sequence.

        Parameters
        ----------
        dataset : Dataset
            The input dataset to process.

        Returns
        -------
        Dataset
            The processed dataset.

        Raises
        ------
        EmptyDatasetError
            If a block produces an empty dataset.
        """
        for step in self.chained_blocks:
            block_config = step["block_config"]
            block = step["block_type"](**block_config)

            logger.debug("------------------------------------\n")
            logger.debug("Running block: %s", block_config["block_name"])
            logger.debug("Input dataset: %s", dataset)

            dataset = block.generate(dataset, **step.get("gen_kwargs", {}))

            # An empty dataset would make every downstream block a no-op,
            # so fail loudly instead of silently producing nothing.
            if len(dataset) == 0:
                raise EmptyDatasetError(
                    f"Pipeline stopped: Empty dataset after running block: {block_config['block_name']}"
                )

            drop_columns = step.get("drop_columns", [])
            if drop_columns:
                # Only remove columns that actually exist in the dataset.
                present = [col for col in drop_columns if col in dataset.column_names]
                dataset = dataset.remove_columns(present)

            dedup_cols = step.get("drop_duplicates", False)
            if dedup_cols:
                dataset = self._drop_duplicates(dataset, cols=dedup_cols)

            logger.debug("Output dataset: %s", dataset)
            logger.debug("------------------------------------\n\n")

        return dataset
|
sdg_hub/prompts.py
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# Local
|
2
|
+
from .registry import PromptRegistry
|
3
|
+
|
4
|
+
|
5
|
+
@PromptRegistry.register("blank")
def blank_chat_template():
    """Return a pass-through chat template that renders ``messages`` with no framing."""
    return """{{ messages }}"""
|
8
|
+
|
9
|
+
|
10
|
+
@PromptRegistry.register("instructlab")
def instructlab_chat_template():
    """Return the InstructLab chat template.

    Frames turns with ``<|system|>``/``<|user|>``/``<|assistant|>`` tags and
    supports a non-standard ``pretraining`` role wrapped in
    ``<|pretrain|>``/``<|/pretrain|>`` markers. When ``add_generation_prompt``
    is set, an assistant header is appended after the last message.
    """
    return """{% for message in messages %}{% if message['role'] == 'pretraining' %}{{ '<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}{% elif message['role'] == 'system' %}{{ '<|system|>' + '\n' + message['content'] + '\n' }}{% elif message['role'] == 'user' %}{{ '<|user|>' + '\n' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' + '\n' }}{% endif %}{% endfor %}"""
|
13
|
+
|
14
|
+
|
15
|
+
@PromptRegistry.register("mistralai")
def mistral_chat_template():
    """Return the Mistral ``[INST]``-style chat template.

    An optional leading system message is folded into the first user turn.
    The template raises a Jinja exception if roles do not strictly alternate
    user/assistant after the optional system message, or if any other role
    appears.
    """
    return """{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n<s>\n{%- for message in loop_messages %}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n    {%- endif %}\n    {%- if message['role'] == 'user' %}\n        {%- if loop.first and system_message is defined %}\n            {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n        {%- else %}\n            {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {{- ' ' + message['content'] + '</s>'}}\n    {%- else %}\n        {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n    {%- endif %}\n{%- endfor %}\n"""
|
18
|
+
|
19
|
+
|
20
|
+
@PromptRegistry.register("meta-llama/Llama-3.3")
def meta_llama_chat_template():
    """Return the Meta Llama 3.x instruct chat template.

    Uses ``<|start_header_id|>``/``<|end_header_id|>``/``<|eot_id|>`` framing
    and includes the full tool/function-calling machinery (``builtin_tools``,
    ``custom_tools``, ``tools_in_user_message``, ``ipython`` role) from the
    upstream Hugging Face chat template. Only single tool calls per assistant
    message are supported; violations raise a Jinja exception.
    """
    return """{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n    {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n    {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n    {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n    {#- Extract the first user message so we can plug it in here #}\n    {%- if messages | length != 0 %}\n        {%- set first_user_message = messages[0]['content']|trim %}\n        {%- set messages = messages[1:] %}\n    {%- else %}\n        {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n    {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n    {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n    {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n    {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n    {{- \"Do not use variables.\\n\\n\" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- \"\\n\\n\" }}\n    {%- endfor %}\n    {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n    {%- elif 'tool_calls' in message %}\n        {%- if not message.tool_calls|length == 1 %}\n            {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n        {%- endif %}\n        {%- set tool_call = message.tool_calls[0].function %}\n        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n            {%- for arg_name, arg_val in tool_call.arguments | items %}\n                {{- arg_name + '=\"' + arg_val + '\"' }}\n                {%- if not loop.last %}\n                    {{- \", \" }}\n                {%- endif %}\n                {%- endfor %}\n            {{- \")\" }}\n        {%- else  %}\n            {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n            {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n            {{- '\"parameters\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \"}\" }}\n        {%- endif %}\n        {%- if builtin_tools is defined %}\n            {#- This means we're in ipython mode #}\n            {{- \"<|eom_id|>\" }}\n        {%- else %}\n            {{- \"<|eot_id|>\" }}\n        {%- endif %}\n    {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n        {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n        {%- if message.content is mapping or message.content is iterable %}\n            {{- message.content | tojson }}\n        {%- else %}\n            {{- message.content }}\n        {%- endif %}\n        {{- \"<|eot_id|>\" }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n"""
|
23
|
+
|
24
|
+
|
25
|
+
@PromptRegistry.register("microsoft/phi-4")
def microsoft_phi_chat_template():
    """Return the Phi-4 chat template.

    Frames each system/user/assistant turn with
    ``<|im_start|>{role}<|im_sep|>...<|im_end|>`` markers; when
    ``add_generation_prompt`` is set, an assistant header is appended.
    """
    return """{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>' + message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}"""
|
28
|
+
|
29
|
+
@PromptRegistry.register("nvidia/Llama-3_3-Nemotron-Super-49B-v1")
def nemotron_chat_template():
    """Return the Nemotron chat template.

    Llama-3-style header framing with a fixed system message that enables
    reasoning ("detailed thinking on"). For prior assistant turns that
    contain a ``</think>`` marker, only the text after the marker is kept,
    so earlier chain-of-thought is stripped from the context.
    """
    return """{{- bos_token }}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}detailed thinking on{{- "<|eot_id|>" }}
{%- for message in messages %}
{%- if message['role'] == 'assistant' and '</think>' in message['content'] %}
{%- set content = message['content'].split('</think>')[-1].lstrip() %}
{%- else %}
{%- set content = message['content'] %}
{%- endif %}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + content | trim + '<|eot_id|>' }}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}"""
|
sdg_hub/py.typed
ADDED
File without changes
|