sdg-hub 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. sdg_hub/__init__.py +3 -0
  2. sdg_hub/_version.py +21 -0
  3. sdg_hub/blocks/__init__.py +36 -0
  4. sdg_hub/blocks/block.py +96 -0
  5. sdg_hub/blocks/llmblock.py +375 -0
  6. sdg_hub/blocks/utilblocks.py +597 -0
  7. sdg_hub/checkpointer.py +139 -0
  8. sdg_hub/configs/__init__.py +0 -0
  9. sdg_hub/configs/annotations/__init__.py +0 -0
  10. sdg_hub/configs/annotations/cot_reflection.yaml +34 -0
  11. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  12. sdg_hub/configs/annotations/detailed_description.yaml +10 -0
  13. sdg_hub/configs/annotations/detailed_description_icl.yaml +32 -0
  14. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  15. sdg_hub/configs/knowledge/__init__.py +0 -0
  16. sdg_hub/configs/knowledge/atomic_facts.yaml +45 -0
  17. sdg_hub/configs/knowledge/auxilary_instructions.yaml +35 -0
  18. sdg_hub/configs/knowledge/detailed_summary.yaml +17 -0
  19. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +68 -0
  20. sdg_hub/configs/knowledge/evaluate_question.yaml +38 -0
  21. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +85 -0
  22. sdg_hub/configs/knowledge/extractive_summary.yaml +17 -0
  23. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +39 -0
  24. sdg_hub/configs/knowledge/generate_questions_responses.yaml +56 -0
  25. sdg_hub/configs/knowledge/mcq_generation.yaml +83 -0
  26. sdg_hub/configs/knowledge/router.yaml +12 -0
  27. sdg_hub/configs/knowledge/simple_generate_qa.yaml +34 -0
  28. sdg_hub/configs/reasoning/__init__.py +0 -0
  29. sdg_hub/configs/reasoning/dynamic_cot.yaml +40 -0
  30. sdg_hub/configs/skills/__init__.py +0 -0
  31. sdg_hub/configs/skills/analyzer.yaml +48 -0
  32. sdg_hub/configs/skills/annotation.yaml +36 -0
  33. sdg_hub/configs/skills/contexts.yaml +28 -0
  34. sdg_hub/configs/skills/critic.yaml +60 -0
  35. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +111 -0
  36. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +78 -0
  37. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +119 -0
  38. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +51 -0
  39. sdg_hub/configs/skills/freeform_questions.yaml +34 -0
  40. sdg_hub/configs/skills/freeform_responses.yaml +39 -0
  41. sdg_hub/configs/skills/grounded_questions.yaml +38 -0
  42. sdg_hub/configs/skills/grounded_responses.yaml +59 -0
  43. sdg_hub/configs/skills/icl_examples/STEM.yaml +56 -0
  44. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  45. sdg_hub/configs/skills/icl_examples/coding.yaml +97 -0
  46. sdg_hub/configs/skills/icl_examples/extraction.yaml +36 -0
  47. sdg_hub/configs/skills/icl_examples/humanities.yaml +71 -0
  48. sdg_hub/configs/skills/icl_examples/math.yaml +85 -0
  49. sdg_hub/configs/skills/icl_examples/reasoning.yaml +30 -0
  50. sdg_hub/configs/skills/icl_examples/roleplay.yaml +45 -0
  51. sdg_hub/configs/skills/icl_examples/writing.yaml +80 -0
  52. sdg_hub/configs/skills/judge.yaml +53 -0
  53. sdg_hub/configs/skills/planner.yaml +67 -0
  54. sdg_hub/configs/skills/respond.yaml +8 -0
  55. sdg_hub/configs/skills/revised_responder.yaml +78 -0
  56. sdg_hub/configs/skills/router.yaml +59 -0
  57. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +27 -0
  58. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +31 -0
  59. sdg_hub/flow.py +306 -0
  60. sdg_hub/flow_runner.py +204 -0
  61. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +13 -0
  62. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +12 -0
  63. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +89 -0
  64. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +136 -0
  65. sdg_hub/flows/generation/skills/improve_responses.yaml +103 -0
  66. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +12 -0
  67. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +12 -0
  68. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +80 -0
  69. sdg_hub/flows/generation/skills/synth_skills.yaml +59 -0
  70. sdg_hub/logger_config.py +20 -0
  71. sdg_hub/pipeline.py +121 -0
  72. sdg_hub/prompts.py +43 -0
  73. sdg_hub/py.typed +0 -0
  74. sdg_hub/registry.py +122 -0
  75. sdg_hub/sdg.py +206 -0
  76. sdg_hub/utils/__init__.py +5 -0
  77. sdg_hub/utils/datautils.py +14 -0
  78. sdg_hub-0.1.0.dist-info/METADATA +190 -0
  79. sdg_hub-0.1.0.dist-info/RECORD +82 -0
  80. sdg_hub-0.1.0.dist-info/WHEEL +5 -0
  81. sdg_hub-0.1.0.dist-info/licenses/LICENSE +201 -0
  82. sdg_hub-0.1.0.dist-info/top_level.txt +1 -0
sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml ADDED
@@ -0,0 +1,136 @@
+ - block_type: DuplicateColumns
+   block_config:
+     block_name: duplicate_document_col
+     columns_map:
+       document: base_document
+
+ - block_type: LLMBlock
+   block_config:
+     block_name: gen_detailed_summary
+     config_path: configs/knowledge/detailed_summary.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - summary_detailed
+   gen_kwargs:
+     max_tokens: 2048
+
+ - block_type: LLMBlock
+   block_config:
+     block_name: gen_atomic_facts
+     config_path: configs/knowledge/atomic_facts.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - summary_atomic_facts
+   gen_kwargs:
+     max_tokens: 2048
+
+ - block_type: LLMBlock
+   block_config:
+     block_name: gen_extractive_summary
+     config_path: configs/knowledge/extractive_summary.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - summary_extractive
+   gen_kwargs:
+     max_tokens: 2048
+
+ - block_type: FlattenColumnsBlock
+   block_config:
+     block_name: flatten_summary_columns
+     var_cols:
+       - summary_detailed
+       - summary_extractive
+       - summary_atomic_facts
+       - base_document
+     value_name: summary
+     var_name: dataset_type
+
+ - block_type: RenameColumns
+   block_config:
+     block_name: rename_to_document_column
+     columns_map:
+       document: raw_document
+       summary: document
+
+ - block_type: LLMBlock
+   block_config:
+     block_name: knowledge generation
+     config_path: configs/knowledge/generate_questions_responses.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - question
+       - response
+     parser_kwargs:
+       parser_name: custom
+       parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
+       parser_cleanup_tags:
+         - "[END]"
+   gen_kwargs:
+     temperature: 0.0
+     max_tokens: 2048
+
+ - block_type: LLMBlock
+   block_config:
+     block_name: eval_faithfulness_qa_pair
+     config_path: configs/knowledge/evaluate_faithfulness.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - explanation
+       - judgment
+   gen_kwargs:
+     max_tokens: 2048
+
+ - block_type: FilterByValueBlock
+   block_config:
+     block_name: filter_faithfulness
+     filter_column: judgment
+     filter_value: "YES"
+     operation: operator.eq
+   drop_columns:
+     - judgment
+     - explanation
+
+ - block_type: LLMBlock
+   block_config:
+     block_name: eval_relevancy_qa_pair
+     config_path: configs/knowledge/evaluate_relevancy.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - feedback
+       - score
+   gen_kwargs:
+     max_tokens: 2048
+
+ - block_type: FilterByValueBlock
+   block_config:
+     block_name: filter_relevancy
+     filter_column: score
+     filter_value: 2.0
+     operation: operator.eq
+     convert_dtype: float
+   drop_columns:
+     - feedback
+     - score
+
+ - block_type: LLMBlock
+   block_config:
+     block_name: eval_verify_question
+     config_path: configs/knowledge/evaluate_question.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - explanation
+       - rating
+   gen_kwargs:
+     max_tokens: 2048
+
+ - block_type: FilterByValueBlock
+   block_config:
+     block_name: filter_verify_question
+     filter_column: rating
+     filter_value: 1.0
+     operation: operator.eq
+     convert_dtype: float
+   drop_columns:
+     - explanation
+     - rating
+     - __index_level_0__
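Note: the `knowledge generation` block above relies on a custom parser: `parsing_pattern` splits each raw completion into [QUESTION]/[ANSWER] pairs and `parser_cleanup_tags` strips trailing markers. A minimal sketch of how that regex behaves on its own, with a made-up completion; the standalone `re` usage here is illustrative and not code from the package:

import re

# The raw pattern from parsing_pattern above (the YAML string double-escapes the backslashes).
PATTERN = (
    r"\[(?:Question|QUESTION)\]\s*(.*?)\s*\[(?:Answer|ANSWER)\]\s*(.*?)"
    r"\s*(?=\[(?:Question|QUESTION)\]|$)"
)

# Hypothetical completion in the format the prompt config asks the model to produce.
completion = (
    "[QUESTION] What is covered in the document? "
    "[ANSWER] A synthetic data generation flow. [END] "
    "[QUESTION] Which model generates the QA pairs? "
    "[ANSWER] Mixtral-8x7B-Instruct-v0.1. [END]"
)

for question, answer in re.findall(PATTERN, completion, flags=re.DOTALL):
    # parser_cleanup_tags lists "[END]", so strip it from the captured answer.
    print(question, "|", answer.replace("[END]", "").strip())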
sdg_hub/flows/generation/skills/improve_responses.yaml ADDED
@@ -0,0 +1,103 @@
+ - block_type: LLMBlock
+   block_config:
+     block_name: router
+     config_path: configs/skills/router.yaml
+     model_id: meta-llama/Llama-3.3-70B-Instruct
+     output_cols:
+       - route
+   gen_kwargs:
+     temperature: 0
+     max_tokens: 5
+     extra_body:
+       guided_choice:
+         - "coding"
+         - "extraction"
+         - "humanities"
+         - "math"
+         - "reasoning"
+         - "roleplay"
+         - "STEM"
+         - "writing"
+ - block_type: SamplePopulatorBlock
+   block_config:
+     block_name: icl_populator
+     config_paths:
+       - configs/skills/icl_examples/coding.yaml
+       - configs/skills/icl_examples/extraction.yaml
+       - configs/skills/icl_examples/humanities.yaml
+       - configs/skills/icl_examples/math.yaml
+       - configs/skills/icl_examples/reasoning.yaml
+       - configs/skills/icl_examples/roleplay.yaml
+       - configs/skills/icl_examples/STEM.yaml
+       - configs/skills/icl_examples/writing.yaml
+     column_name: route
+     batch_kwargs:
+       num_procs: 8
+ - block_type: LLMBlock
+   block_config:
+     block_name: analyzer
+     config_path: configs/skills/analyzer.yaml
+     model_id: meta-llama/Llama-3.3-70B-Instruct
+     output_cols:
+       - analysis
+       - rubric
+ - block_type: LLMBlock
+   block_config:
+     block_name: critic
+     config_path: configs/skills/critic.yaml
+     model_id: meta-llama/Llama-3.3-70B-Instruct
+     output_cols:
+       - critique
+ - block_type: LLMBlock
+   block_config:
+     block_name: planner
+     config_path: configs/skills/planner.yaml
+     model_id: meta-llama/Llama-3.3-70B-Instruct
+     output_cols:
+       - plan
+ - block_type: LLMBlock
+   block_config:
+     block_name: revised_responder
+     config_path: configs/skills/revised_responder.yaml
+     model_id: meta-llama/Llama-3.3-70B-Instruct
+     output_cols:
+       - revised_response
+   drop_columns:
+     - icl_query
+     - icl_response
+     - icl_analysis
+     - icl_rubric
+     - icl_critique
+     - icl_plan
+     - icl_revised_response
+ - block_type: LLMBlock
+   block_config:
+     block_name: judge
+     config_path: configs/skills/judge.yaml
+     model_id: meta-llama/Llama-3.3-70B-Instruct
+     output_cols:
+       - judgement
+       - verdict
+ - block_type: FilterByValueBlock
+   block_config:
+     block_name: filter_judgement
+     filter_column: verdict
+     filter_value:
+       - Assistant A
+       - Assistant B
+     operation: operator.contains
+     batch_kwargs:
+       num_procs: 8
+ - block_type: SelectorBlock
+   block_config:
+     block_name: response_selector
+     choice_map:
+       Assistant A: "response"
+       Assistant B: "revised_response"
+     choice_col: verdict
+     output_col: chosen_response
+     batch_kwargs:
+       num_procs: 8
+   drop_columns:
+     - judgemnent
+     - verdict
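Note: the router block passes `extra_body.guided_choice` along with the generation request, so an OpenAI-compatible backend that supports guided decoding (vLLM, for example) restricts the routing output to one of the eight listed categories. A sketch of what such a call could look like with the `openai` client; the endpoint URL and the assumption that the serving backend honors `guided_choice` are illustrative, not taken from this package:

from openai import OpenAI

# Hypothetical endpoint serving meta-llama/Llama-3.3-70B-Instruct behind an OpenAI-compatible API.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[{"role": "user", "content": "Classify this task: 'Write a haiku about rain.'"}],
    temperature=0,
    max_tokens=5,
    # Passed through verbatim; servers such as vLLM read it to constrain decoding.
    extra_body={"guided_choice": ["coding", "extraction", "humanities", "math",
                                  "reasoning", "roleplay", "STEM", "writing"]},
)
print(resp.choices[0].message.content)  # e.g. "writing"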
sdg_hub/flows/generation/skills/simple_freeform_skill.yaml ADDED
@@ -0,0 +1,12 @@
+ - block_type: LLMBlock
+   block_config:
+     block_name: gen_skill_freeform
+     config_path: configs/skills/simple_generate_qa_freeform.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - output
+   gen_kwargs:
+     temperature: 0.7
+     max_tokens: 2048
+   drop_duplicates:
+     - output
sdg_hub/flows/generation/skills/simple_grounded_skill.yaml ADDED
@@ -0,0 +1,12 @@
+ - block_type: LLMBlock
+   block_config:
+     block_name: gen_skill_grounded
+     config_path: configs/skills/simple_generate_qa_grounded.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - output
+   gen_kwargs:
+     temperature: 0.7
+     max_tokens: 2048
+   drop_duplicates:
+     - output
sdg_hub/flows/generation/skills/synth_grounded_skills.yaml ADDED
@@ -0,0 +1,80 @@
+ - block_type: LLMBlock
+   block_config:
+     block_name: gen_contexts
+     config_path: configs/skills/contexts.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - context
+   gen_kwargs:
+     temperature: 0.7
+     max_tokens: 2048
+     n: 10
+     seed: 42
+   drop_duplicates:
+     - context
+ - block_type: LLMBlock
+   block_config:
+     block_name: gen_grounded_questions
+     config_path: configs/skills/grounded_questions.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - question
+     batch_kwargs:
+       num_samples: 3
+   drop_duplicates:
+     - question
+ - block_type: LLMBlock
+   block_config:
+     block_name: eval_grounded_questions
+     config_path: configs/skills/evaluate_grounded_questions.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - evaluation
+       - score
+ - block_type: FilterByValueBlock
+   block_config:
+     block_name: filter_grounded_questions
+     filter_column: score
+     filter_value: 1.0
+     operation: operator.eq
+     convert_dtype: float
+     batch_kwargs:
+       num_procs: 8
+   drop_columns:
+     - evaluation
+     - score
+     - num_samples
+ - block_type: LLMBlock
+   block_config:
+     block_name: gen_grounded_responses
+     config_path: configs/skills/grounded_responses.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - response
+ - block_type: LLMBlock
+   block_config:
+     block_name: evaluate_grounded_qa_pair
+     config_path: configs/skills/evaluate_grounded_pair.yaml
+     model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
+     output_cols:
+       - evaluation
+       - score
+ - block_type: FilterByValueBlock
+   block_config:
+     block_name: filter_grounded_qa_pair
+     filter_column: score
+     filter_value: 2.0
+     operation: operator.ge
+     convert_dtype: float
+     batch_kwargs:
+       num_procs: 8
+ - block_type: CombineColumnsBlock
+   block_config:
+     block_name: combine_question_and_context
+     columns:
+       - context
+       - question
+     output_col: question
+     batch_kwargs:
+       num_procs: 8
+       batched: True
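Note: the `FilterByValueBlock` entries in these flows are configured purely through `filter_column`, `filter_value`, a dotted `operation` such as `operator.ge`, and an optional `convert_dtype`. The block implementation is not part of this excerpt, so the following is only a sketch of the semantics those keys describe, expressed with a plain `datasets` filter and hypothetical scored rows:

import operator
from datasets import Dataset

# Hypothetical output of an evaluation block: scores arrive as strings.
ds = Dataset.from_dict({"question": ["q1", "q2", "q3"], "score": ["1", "2.5", "3"]})

# filter_column: score / filter_value: 2.0 / operation: operator.ge / convert_dtype: float
filter_column, filter_value, op, convert_dtype = "score", 2.0, operator.ge, float

kept = ds.filter(lambda row: op(convert_dtype(row[filter_column]), filter_value))
print(kept["question"])  # ['q2', 'q3']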
sdg_hub/flows/generation/skills/synth_skills.yaml ADDED
@@ -0,0 +1,59 @@
+ - block_type: LLMBlock
+   block_config:
+     block_name: gen_questions
+     config_path: configs/skills/freeform_questions.yaml
+     model_id: meta-llama/Llama-3.3-70B-Instruct
+     output_cols:
+       - question
+     batch_kwargs:
+       num_samples: 30
+   drop_duplicates:
+     - question
+ - block_type: LLMBlock
+   block_config:
+     block_name: eval_questions
+     config_path: configs/skills/evaluate_freeform_questions.yaml
+     model_id: meta-llama/Llama-3.3-70B-Instruct
+     output_cols:
+       - evaluation
+       - score
+ - block_type: FilterByValueBlock
+   block_config:
+     block_name: filter_questions
+     filter_column: score
+     filter_value: 1.0
+     operation: operator.eq
+     convert_dtype: float
+     batch_kwargs:
+       num_procs: 8
+   drop_columns:
+     - evaluation
+     - score
+     - num_samples
+ - block_type: LLMBlock
+   block_config:
+     block_name: gen_responses
+     config_path: configs/skills/freeform_responses.yaml
+     model_id: meta-llama/Llama-3.3-70B-Instruct
+     output_cols:
+       - response
+ - block_type: LLMBlock
+   block_config:
+     block_name: evaluate_qa_pair
+     config_path: configs/skills/evaluate_freeform_pair.yaml
+     model_id: meta-llama/Llama-3.3-70B-Instruct
+     output_cols:
+       - evaluation
+       - score
+ - block_type: FilterByValueBlock
+   block_config:
+     block_name: filter_qa_pair
+     filter_column: score
+     filter_value: 2.0
+     operation: operator.ge
+     convert_dtype: float
+     batch_kwargs:
+       num_procs: 8
+   drop_columns:
+     - evaluation
+     - score
sdg_hub/logger_config.py ADDED
@@ -0,0 +1,20 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # Standard
+ import os
+ import logging
+
+ # Third Party
+ from rich.logging import RichHandler
+
+
+ def setup_logger(name):
+     # Set up the logger
+     log_level = os.getenv("LOG_LEVEL", "INFO")
+     logging.basicConfig(
+         level=log_level,
+         format="%(message)s",
+         datefmt="[%X]",
+         handlers=[RichHandler()],
+     )
+     logger = logging.getLogger(name)
+     return logger
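Note: `setup_logger` reads the `LOG_LEVEL` environment variable (defaulting to INFO) and routes records through Rich's handler. A small usage sketch; the variable must be set before the first call, since the level is read at configuration time:

import os

# Must be set before setup_logger is called.
os.environ["LOG_LEVEL"] = "DEBUG"

from sdg_hub.logger_config import setup_logger

log = setup_logger(__name__)
log.debug("Rich-formatted debug output")
log.info("Flow starting")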
sdg_hub/pipeline.py ADDED
@@ -0,0 +1,121 @@
+ """
+ Deprecated Pipeline class for data generation pipelines.
+
+ Use the Flow class directly for new code.
+ """
+
+ # SPDX-License-Identifier: Apache-2.0
+ # Standard
+ import warnings
+ from typing import List, Dict, Any
+
+ # Third Party
+ from datasets import Dataset
+ from datasets.data_files import EmptyDatasetError
+
+ # Local
+ from .logger_config import setup_logger
+
+ logger = setup_logger(__name__)
+
+
+ class Pipeline:
+     """A class representing a data generation pipeline.
+
+     This class is deprecated and will be removed in a future version.
+     Use the Flow class directly instead.
+
+     Parameters
+     ----------
+     chained_blocks : List[Dict[str, Any]]
+         List of block configurations to execute in sequence.
+
+     Attributes
+     ----------
+     chained_blocks : List[Dict[str, Any]]
+         List of block configurations to execute in sequence.
+     """
+
+     def __init__(self, chained_blocks: List[Dict[str, Any]]) -> None:
+         """
+         Initialize the Pipeline class with a configuration dictionary.
+
+         DEPRECATED: This class is deprecated and will be removed in a future version.
+         Use the Flow class directly instead.
+         """
+         warnings.warn(
+             "Pipeline class is deprecated and will be removed in a future version. "
+             "Use Flow class directly instead of wrapping it with Pipeline.",
+             DeprecationWarning,
+             stacklevel=2
+         )
+         # pipeline config is the run configuration that consists of the pipeline steps
+         self.chained_blocks = chained_blocks
+
+     def _drop_duplicates(self, dataset: Dataset, cols: List[str]) -> Dataset:
+         """Drop duplicates from the dataset based on the columns provided.
+
+         Parameters
+         ----------
+         dataset : Dataset
+             The input dataset.
+         cols : List[str]
+             Columns to consider for duplicate detection.
+
+         Returns
+         -------
+         Dataset
+             Dataset with duplicates removed.
+         """
+         df = dataset.to_pandas()
+         df = df.drop_duplicates(subset=cols).reset_index(drop=True)
+         return Dataset.from_pandas(df)
+
+     def generate(self, dataset: Dataset) -> Dataset:
+         """Generate the dataset by running the pipeline steps.
+
+         Parameters
+         ----------
+         dataset : Dataset
+             The input dataset to process.
+
+         Returns
+         -------
+         Dataset
+             The processed dataset.
+
+         Raises
+         ------
+         EmptyDatasetError
+             If a block produces an empty dataset.
+         """
+         for block_prop in self.chained_blocks:
+             block_type = block_prop["block_type"]
+             block_config = block_prop["block_config"]
+             drop_columns = block_prop.get("drop_columns", [])
+             gen_kwargs = block_prop.get("gen_kwargs", {})
+             drop_duplicates_cols = block_prop.get("drop_duplicates", False)
+             block = block_type(**block_config)
+
+             logger.debug("------------------------------------\n")
+             logger.debug("Running block: %s", block_config["block_name"])
+             logger.debug("Input dataset: %s", dataset)
+
+             dataset = block.generate(dataset, **gen_kwargs)
+
+             if len(dataset) == 0:
+                 raise EmptyDatasetError(
+                     f"Pipeline stopped: Empty dataset after running block: {block_config['block_name']}"
+                 )
+
+             drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names]
+             if drop_columns:
+                 dataset = dataset.remove_columns(drop_columns_in_ds)
+
+             if drop_duplicates_cols:
+                 dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
+
+             logger.debug("Output dataset: %s", dataset)
+             logger.debug("------------------------------------\n\n")
+
+         return dataset
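Note: although `Pipeline` is deprecated in favor of `Flow`, the loop above spells out the contract a block must satisfy: `block_type` is a class instantiated with `**block_config`, it must expose `generate(dataset, **gen_kwargs)` returning a `Dataset`, and `gen_kwargs`, `drop_columns`, and `drop_duplicates` are handled at the pipeline level. A minimal sketch with a toy block; `UpperCaseBlock` is invented for illustration and is not part of the package:

from datasets import Dataset
from sdg_hub.pipeline import Pipeline

class UpperCaseBlock:
    """Toy block: instantiated with **block_config, must implement generate()."""

    def __init__(self, block_name, column):
        self.block_name = block_name
        self.column = column

    def generate(self, dataset, **gen_kwargs):
        return dataset.map(lambda row: {self.column: row[self.column].upper()})

ds = Dataset.from_dict({"document": ["alpha", "beta", "alpha"]})
pipe = Pipeline(
    [
        {
            "block_type": UpperCaseBlock,  # a class object, not a string, in this deprecated API
            "block_config": {"block_name": "uppercase", "column": "document"},
            "drop_duplicates": ["document"],  # applied by Pipeline after the block runs
        }
    ]
)
print(pipe.generate(ds)["document"])  # ['ALPHA', 'BETA']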
sdg_hub/prompts.py ADDED
@@ -0,0 +1,43 @@
+ # Local
+ from .registry import PromptRegistry
+
+
+ @PromptRegistry.register("blank")
+ def blank_chat_template():
+     return """{{ messages }}"""
+
+
+ @PromptRegistry.register("instructlab")
+ def instructlab_chat_template():
+     return """{% for message in messages %}{% if message['role'] == 'pretraining' %}{{ '<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}{% elif message['role'] == 'system' %}{{ '<|system|>' + '\n' + message['content'] + '\n' }}{% elif message['role'] == 'user' %}{{ '<|user|>' + '\n' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' + '\n' }}{% endif %}{% endfor %}"""
+
+
+ @PromptRegistry.register("mistralai")
+ def mistral_chat_template():
+     return """{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n<s>\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + '</s>'}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n"""
+
+
+ @PromptRegistry.register("meta-llama/Llama-3.3")
+ def meta_llama_chat_template():
+     return """{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n"""
+
+
+ @PromptRegistry.register("microsoft/phi-4")
+ def microsoft_phi_chat_template():
+     return """{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>' + message['content'] + '<|im_end|>'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}"""
+
+ @PromptRegistry.register("nvidia/Llama-3_3-Nemotron-Super-49B-v1")
+ def nemotron_chat_template():
+     return """{{- bos_token }}
+ {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}detailed thinking on{{- "<|eot_id|>" }}
+ {%- for message in messages %}
+ {%- if message['role'] == 'assistant' and '</think>' in message['content'] %}
+ {%- set content = message['content'].split('</think>')[-1].lstrip() %}
+ {%- else %}
+ {%- set content = message['content'] %}
+ {%- endif %}
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + content | trim + '<|eot_id|>' }}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+ {%- endif %}"""
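Note: each function above returns a Jinja2 chat-template string that `PromptRegistry` keys by model family; how the registry is consumed lives in registry.py and llmblock.py, which are not shown in this excerpt. The templates themselves can be rendered with plain Jinja2, as in this sketch using the `instructlab` template and a made-up message list:

from jinja2 import Template
from sdg_hub.prompts import instructlab_chat_template

messages = [
    {"role": "system", "content": "You are a synthetic data generator."},
    {"role": "user", "content": "Write one question about YAML flows."},
]

# Render the registered template directly; add_generation_prompt appends the assistant header.
rendered = Template(instructlab_chat_template()).render(
    messages=messages, add_generation_prompt=True
)
print(rendered)
# <|system|>
# You are a synthetic data generator.
# <|user|>
# Write one question about YAML flows.
# <|assistant|>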
sdg_hub/py.typed ADDED
File without changes