sdg-hub 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. sdg_hub/__init__.py +3 -0
  2. sdg_hub/_version.py +21 -0
  3. sdg_hub/blocks/__init__.py +36 -0
  4. sdg_hub/blocks/block.py +96 -0
  5. sdg_hub/blocks/llmblock.py +375 -0
  6. sdg_hub/blocks/utilblocks.py +597 -0
  7. sdg_hub/checkpointer.py +139 -0
  8. sdg_hub/configs/__init__.py +0 -0
  9. sdg_hub/configs/annotations/__init__.py +0 -0
  10. sdg_hub/configs/annotations/cot_reflection.yaml +34 -0
  11. sdg_hub/configs/annotations/detailed_annotations.yaml +28 -0
  12. sdg_hub/configs/annotations/detailed_description.yaml +10 -0
  13. sdg_hub/configs/annotations/detailed_description_icl.yaml +32 -0
  14. sdg_hub/configs/annotations/simple_annotations.yaml +9 -0
  15. sdg_hub/configs/knowledge/__init__.py +0 -0
  16. sdg_hub/configs/knowledge/atomic_facts.yaml +45 -0
  17. sdg_hub/configs/knowledge/auxilary_instructions.yaml +35 -0
  18. sdg_hub/configs/knowledge/detailed_summary.yaml +17 -0
  19. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +68 -0
  20. sdg_hub/configs/knowledge/evaluate_question.yaml +38 -0
  21. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +85 -0
  22. sdg_hub/configs/knowledge/extractive_summary.yaml +17 -0
  23. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +39 -0
  24. sdg_hub/configs/knowledge/generate_questions_responses.yaml +56 -0
  25. sdg_hub/configs/knowledge/mcq_generation.yaml +83 -0
  26. sdg_hub/configs/knowledge/router.yaml +12 -0
  27. sdg_hub/configs/knowledge/simple_generate_qa.yaml +34 -0
  28. sdg_hub/configs/reasoning/__init__.py +0 -0
  29. sdg_hub/configs/reasoning/dynamic_cot.yaml +40 -0
  30. sdg_hub/configs/skills/__init__.py +0 -0
  31. sdg_hub/configs/skills/analyzer.yaml +48 -0
  32. sdg_hub/configs/skills/annotation.yaml +36 -0
  33. sdg_hub/configs/skills/contexts.yaml +28 -0
  34. sdg_hub/configs/skills/critic.yaml +60 -0
  35. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +111 -0
  36. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +78 -0
  37. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +119 -0
  38. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +51 -0
  39. sdg_hub/configs/skills/freeform_questions.yaml +34 -0
  40. sdg_hub/configs/skills/freeform_responses.yaml +39 -0
  41. sdg_hub/configs/skills/grounded_questions.yaml +38 -0
  42. sdg_hub/configs/skills/grounded_responses.yaml +59 -0
  43. sdg_hub/configs/skills/icl_examples/STEM.yaml +56 -0
  44. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  45. sdg_hub/configs/skills/icl_examples/coding.yaml +97 -0
  46. sdg_hub/configs/skills/icl_examples/extraction.yaml +36 -0
  47. sdg_hub/configs/skills/icl_examples/humanities.yaml +71 -0
  48. sdg_hub/configs/skills/icl_examples/math.yaml +85 -0
  49. sdg_hub/configs/skills/icl_examples/reasoning.yaml +30 -0
  50. sdg_hub/configs/skills/icl_examples/roleplay.yaml +45 -0
  51. sdg_hub/configs/skills/icl_examples/writing.yaml +80 -0
  52. sdg_hub/configs/skills/judge.yaml +53 -0
  53. sdg_hub/configs/skills/planner.yaml +67 -0
  54. sdg_hub/configs/skills/respond.yaml +8 -0
  55. sdg_hub/configs/skills/revised_responder.yaml +78 -0
  56. sdg_hub/configs/skills/router.yaml +59 -0
  57. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +27 -0
  58. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +31 -0
  59. sdg_hub/flow.py +306 -0
  60. sdg_hub/flow_runner.py +204 -0
  61. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +13 -0
  62. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +12 -0
  63. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +89 -0
  64. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +136 -0
  65. sdg_hub/flows/generation/skills/improve_responses.yaml +103 -0
  66. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +12 -0
  67. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +12 -0
  68. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +80 -0
  69. sdg_hub/flows/generation/skills/synth_skills.yaml +59 -0
  70. sdg_hub/logger_config.py +20 -0
  71. sdg_hub/pipeline.py +121 -0
  72. sdg_hub/prompts.py +43 -0
  73. sdg_hub/py.typed +0 -0
  74. sdg_hub/registry.py +122 -0
  75. sdg_hub/sdg.py +206 -0
  76. sdg_hub/utils/__init__.py +5 -0
  77. sdg_hub/utils/datautils.py +14 -0
  78. sdg_hub-0.1.0.dist-info/METADATA +190 -0
  79. sdg_hub-0.1.0.dist-info/RECORD +82 -0
  80. sdg_hub-0.1.0.dist-info/WHEEL +5 -0
  81. sdg_hub-0.1.0.dist-info/licenses/LICENSE +201 -0
  82. sdg_hub-0.1.0.dist-info/top_level.txt +1 -0
sdg_hub/flow.py ADDED
@@ -0,0 +1,306 @@
1
+ """
2
+ Flow module for managing data generation pipelines.
3
+
4
+ This module provides the core Flow class that handles both configuration loading and execution
5
+ of data generation blocks. The Flow class serves as the main interface for defining and running
6
+ data generation pipelines, supporting both direct usage with SDG and backward compatibility
7
+ through the deprecated Pipeline class.
8
+
9
+ Example:
10
+ >>> flow = Flow(llm_client)
11
+ >>> flow = flow.get_flow_from_file("path/to/flow.yaml")
12
+ >>> dataset = flow.generate(input_dataset)
13
+
14
+ Note:
15
+ This module is part of the SDG Hub package and is designed to work in conjunction
16
+ with the SDG class for distributed data generation.
17
+ """
18
+
19
+ # SPDX-License-Identifier: Apache-2.0
20
+ # Standard
21
+ from abc import ABC
22
+ from importlib import resources
23
+ from typing import Optional, List, Dict, Any, Callable
24
+ import operator
25
+ import os
26
+
27
+ # Third Party
28
+ import yaml
29
+ from datasets import Dataset
30
+ from datasets.data_files import EmptyDatasetError
31
+
32
+ # Local
33
+ from .blocks import * # needed to register blocks
34
+ from .prompts import * # needed to register prompts
35
+ from .registry import BlockRegistry, PromptRegistry
36
+ from .logger_config import setup_logger
37
+
38
+
39
# Module-level logger used by Flow for per-block debug output.
logger = setup_logger(__name__)

# Flow YAML names a comparison as a string (e.g. ``operation: operator.eq``).
# Flow.get_flow_from_file() looks that string up here and replaces it with the
# matching callable from the standard ``operator`` module.
OPERATOR_MAP: Dict[str, Callable] = {
    "operator.eq": operator.eq,
    "operator.ge": operator.ge,
    "operator.le": operator.le,
    "operator.gt": operator.gt,
    "operator.lt": operator.lt,
    "operator.ne": operator.ne,
    "operator.contains": operator.contains,
}

# Flow YAML names a conversion type as a string (e.g. ``convert_dtype: float``).
# Flow.get_flow_from_file() looks that string up here and replaces it with the
# matching builtin type.
CONVERT_DTYPE_MAP: Dict[str, Callable] = {
    "float": float,
    "int": int,
}
55
+
56
+
57
class Flow(ABC):
    """A data generation flow: an ordered chain of blocks run over a dataset.

    This class handles both configuration loading (``get_flow_from_file``) and
    execution (``generate``) of data generation blocks. It can be used directly
    with SDG or through the deprecated Pipeline class.
    """

    def __init__(
        self,
        llm_client: Any,
        num_samples_to_generate: Optional[int] = None,
    ) -> None:
        """
        Initialize the Flow class.

        Parameters
        ----------
        llm_client : Any
            The LLM client to use for generation.
        num_samples_to_generate : Optional[int], optional
            Number of samples to generate, by default None

        Attributes
        ----------
        llm_client : Any
            The LLM client instance.
        base_path : str
            Base path for resource files (the installed package directory).
        registered_blocks : Dict[str, Any]
            Registry of available blocks.
        chained_blocks : Optional[List[Dict[str, Any]]]
            List of block configurations; None until a flow file is loaded.
        num_samples_to_generate : Optional[int]
            Number of samples to generate.

        """
        self.llm_client = llm_client
        self.base_path = str(resources.files(__package__))
        self.registered_blocks = BlockRegistry.get_registry()
        # Populated by get_flow_from_file(); generate() refuses to run before that.
        self.chained_blocks: Optional[List[Dict[str, Any]]] = None
        self.num_samples_to_generate = num_samples_to_generate

    def _getFilePath(self, dirs: List[str], filename: str) -> str:
        """Find a named configuration file.

        Files are checked in the following order:
            1. Absolute path is always used
            2. Checked relative to the directories in "dirs"
            3. Relative to the current directory

        Parameters
        ----------
        dirs : List[str]
            Directories in which to search for the file.
        filename : str
            The path to the configuration file.

        Returns
        -------
        str
            Selected file path.
        """
        if os.path.isabs(filename):
            return filename
        for directory in dirs:
            candidate = os.path.join(directory, filename)
            if os.path.isfile(candidate):
                return candidate
        # Not found in any search directory: return the path unchanged, i.e.
        # assume it is relative to the current working directory.
        return filename

    def _drop_duplicates(self, dataset: "Dataset", cols: List[str]) -> "Dataset":
        """Drop duplicates from the dataset based on the columns provided.

        The dataset is round-tripped through pandas; the index is reset so the
        rebuilt dataset is numbered contiguously.

        Parameters
        ----------
        dataset : Dataset
            The input dataset.
        cols : List[str]
            Columns to consider for duplicate detection.

        Returns
        -------
        Dataset
            Dataset with duplicates removed.
        """
        df = dataset.to_pandas()
        df = df.drop_duplicates(subset=cols).reset_index(drop=True)
        return Dataset.from_pandas(df)

    def generate(self, dataset: "Dataset") -> "Dataset":
        """Generate the dataset by running the pipeline steps.

        Each configured block is instantiated from its ``block_config`` and run
        in order; after each block the optional ``drop_columns`` and
        ``drop_duplicates`` post-processing steps are applied.

        Parameters
        ----------
        dataset : Dataset
            The input dataset to process.

        Returns
        -------
        Dataset
            The processed dataset.

        Raises
        ------
        ValueError
            If Flow has not been initialized with blocks.
        EmptyDatasetError
            If a block produces an empty dataset.
        """
        if self.chained_blocks is None:
            # The constructor does not accept blocks, so loading a flow file is
            # the only supported way to initialize the chain.
            raise ValueError(
                "Flow has not been initialized with blocks. "
                "Call get_flow_from_file() first."
            )

        for block_prop in self.chained_blocks:
            block_type = block_prop["block_type"]
            block_config = block_prop["block_config"]
            # A key present in YAML with no value loads as None; treat that the
            # same as the key being absent.
            drop_columns = block_prop.get("drop_columns") or []
            gen_kwargs = block_prop.get("gen_kwargs") or {}
            drop_duplicates_cols = block_prop.get("drop_duplicates", False)
            block = block_type(**block_config)

            logger.debug("------------------------------------\n")
            logger.debug("Running block: %s", block_config["block_name"])
            logger.debug("Input dataset: %s", dataset)

            dataset = block.generate(dataset, **gen_kwargs)

            if len(dataset) == 0:
                raise EmptyDatasetError(
                    f"Pipeline stopped: "
                    f"Empty dataset after running block: "
                    f"{block_config['block_name']}"
                )

            # Only remove the requested columns that actually exist, and skip
            # the call entirely when none of them do.
            drop_columns_in_ds = [
                col for col in drop_columns if col in dataset.column_names
            ]
            if drop_columns_in_ds:
                dataset = dataset.remove_columns(drop_columns_in_ds)

            if drop_duplicates_cols:
                dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)

            logger.debug("Output dataset: %s", dataset)
            logger.debug("------------------------------------\n\n")

        return dataset

    def get_flow_from_file(self, yaml_path: str) -> "Flow":
        """Load and initialize flow configuration from a YAML file.

        The loaded block list is rewritten in place: block type names become
        registered block classes, config paths are resolved, and the string
        spellings of ``operation`` / ``convert_dtype`` become callables.

        Parameters
        ----------
        yaml_path : str
            Path to the YAML configuration file. A path that exists relative to
            the package directory takes precedence over the path as given.

        Returns
        -------
        Flow
            Self with initialized chained_blocks.

        Raises
        ------
        FileNotFoundError
            If the YAML file cannot be found.
        ValueError
            If the YAML file does not contain a list of blocks.
        KeyError
            If a required block or prompt is not found in the registry, or an
            ``operation`` / ``convert_dtype`` value is not supported.
        """
        yaml_path_relative_to_base = os.path.join(self.base_path, yaml_path)
        if os.path.isfile(yaml_path_relative_to_base):
            yaml_path = yaml_path_relative_to_base
        yaml_dir = os.path.dirname(yaml_path)

        try:
            with open(yaml_path, "r", encoding="utf-8") as yaml_file:
                flow = yaml.safe_load(yaml_file)
        except FileNotFoundError as exc:
            raise FileNotFoundError(f"File not found: {yaml_path}") from exc

        # An empty file loads as None and a mapping would iterate its keys;
        # reject both up front instead of failing obscurely in the loop below.
        if not isinstance(flow, list):
            raise ValueError(
                f"Flow file must contain a list of blocks: {yaml_path}"
            )

        # Config files are looked up next to the flow file first, then inside
        # the package.
        search_dirs = [yaml_dir, self.base_path]

        # update config with class instances
        for block in flow:
            block_config = block["block_config"]

            # check if theres an llm block in the flow
            if "LLM" in block["block_type"]:
                block_config["client"] = self.llm_client
                # An explicit model_prompt wins; otherwise pick the first
                # registered prompt key that is a substring of model_id.
                if block_config.get("model_prompt", None) is None:
                    model_id = block_config.get("model_id")
                    if model_id is None:
                        raise KeyError(
                            "LLM block requires 'model_id' or 'model_prompt' "
                            "in block_config"
                        )
                    matched_prompt = next(
                        (
                            key
                            for key in PromptRegistry.get_registry()
                            if key in model_id
                        ),
                        None,
                    )
                    if matched_prompt is None:
                        raise KeyError(f"Prompt not found in registry: {model_id}")
                    block_config["model_prompt"] = matched_prompt

                if self.num_samples_to_generate is not None:
                    # NOTE(review): this is stored on the block entry itself,
                    # not in block_config or gen_kwargs, and generate() in this
                    # class never reads it - confirm the intended destination.
                    block["num_samples"] = self.num_samples_to_generate

            # update block type to llm class instance
            try:
                block["block_type"] = self.registered_blocks[block["block_type"]]
            except KeyError as exc:
                raise KeyError(
                    f"Block not found in registry: {block['block_type']}"
                ) from exc

            # update config path to absolute path
            if "config_path" in block_config:
                block_config["config_path"] = self._getFilePath(
                    search_dirs, block_config["config_path"]
                )

            # update config paths to absolute paths - this might be a list or a dict
            if "config_paths" in block_config:
                config_paths = block_config["config_paths"]
                if isinstance(config_paths, dict):
                    for key, path in config_paths.items():
                        config_paths[key] = self._getFilePath(search_dirs, path)
                elif isinstance(config_paths, list):
                    for i, path in enumerate(config_paths):
                        config_paths[i] = self._getFilePath(search_dirs, path)

            if "operation" in block_config:
                operation_name = block_config["operation"]
                try:
                    block_config["operation"] = OPERATOR_MAP[operation_name]
                except KeyError as exc:
                    raise KeyError(
                        f"Unsupported operation: {operation_name}. "
                        f"Supported operations: {sorted(OPERATOR_MAP)}"
                    ) from exc

            if "convert_dtype" in block_config:
                dtype_name = block_config["convert_dtype"]
                try:
                    block_config["convert_dtype"] = CONVERT_DTYPE_MAP[dtype_name]
                except KeyError as exc:
                    raise KeyError(
                        f"Unsupported convert_dtype: {dtype_name}. "
                        f"Supported dtypes: {sorted(CONVERT_DTYPE_MAP)}"
                    ) from exc

        # Store the chained blocks and return self
        self.chained_blocks = flow
        return self
sdg_hub/flow_runner.py ADDED
@@ -0,0 +1,204 @@
1
+ """Script for running data generation flows with configurable parameters."""
2
+
3
+ # Standard
4
+ import os
5
+
6
+ # Third Party
7
+ from datasets import load_dataset
8
+ from openai import OpenAI
9
+ import click
10
+
11
+ # First Party
12
+ from sdg_hub.flow import Flow
13
+ from sdg_hub.logger_config import setup_logger
14
+ from sdg_hub.sdg import SDG
15
+
16
+
17
+ logger = setup_logger(__name__)
18
+
19
+
20
def run_flow(
    ds_path: str,
    save_path: str,
    endpoint: str,
    flow_path: str,
    checkpoint_dir: str,
    batch_size: int = 8,
    num_workers: int = 32,
    save_freq: int = 2,
    debug: bool = False,
) -> None:
    """Process the dataset using the specified configuration.

    Loads a JSON dataset, builds an OpenAI-compatible client for ``endpoint``,
    loads the flow, runs it through SDG and writes the result as JSON lines.

    Parameters
    ----------
    ds_path : str
        Path to the dataset file.
    save_path : str
        Path where the output will be saved.
    endpoint : str
        API endpoint for data processing.
    flow_path : str
        Path to the flow configuration file.
    checkpoint_dir : str
        Directory path for saving checkpoints.
    batch_size : int, optional
        Batch size for processing, by default 8.
    num_workers : int, optional
        Number of worker processes to use, by default 32.
    save_freq : int, optional
        Frequency (in batches) at which to save checkpoints, by default 2.
    debug : bool, optional
        If True, enables debug mode with a smaller dataset subset (at most
        30 shuffled rows), by default False.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If the flow configuration file is not found.
    """
    logger.info("Generation configuration: %s\n\n", locals())

    # Fail fast: validate the flow path before paying for dataset loading.
    if not os.path.exists(flow_path):
        raise FileNotFoundError(f"Flow file not found: {flow_path}")

    ds = load_dataset("json", data_files=ds_path, split="train")

    if debug:
        # Cap at the dataset size so small inputs do not raise on select().
        subset_size = min(30, len(ds))
        ds = ds.shuffle(seed=42).select(range(subset_size))
        logger.info("Debug mode enabled. Using a subset of the dataset.")

    # Falls back to the placeholder "EMPTY" when OPENAI_API_KEY is not set.
    openai_api_key = os.environ.get("OPENAI_API_KEY", "EMPTY")
    openai_api_base = endpoint

    client = OpenAI(
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    flow = Flow(client).get_flow_from_file(flow_path)
    sdg = SDG(
        flows=[flow],
        num_workers=num_workers,
        batch_size=batch_size,
        save_freq=save_freq,
    )

    generated_data = sdg.generate(ds, checkpoint_dir=checkpoint_dir)
    generated_data.to_json(save_path, orient="records", lines=True)
    logger.info("Data saved to %s", save_path)
92
+
93
+
94
@click.command()
@click.option(
    "--ds_path", type=click.Path(exists=True), required=True,
    help="Path to the dataset.",
)
@click.option(
    "--bs", type=int, default=8, show_default=True,
    help="Batch size for processing.",
)
@click.option(
    "--num_workers", type=int, default=32, show_default=True,
    help="Number of worker processes to use.",
)
@click.option(
    "--save_path", type=click.Path(), required=True,
    help="Path to save the output.",
)
@click.option(
    "--endpoint", type=str, required=True,
    help="API endpoint for data processing.",
)
@click.option(
    "--flow", type=click.Path(exists=True), required=True,
    help="Flow configuration for the process.",
)
@click.option(
    "--checkpoint_dir", type=click.Path(), required=True,
    help="Path to save checkpoints.",
)
@click.option(
    "--save_freq", type=int, default=2, show_default=True,
    help="Frequency to save checkpoints.",
)
@click.option(
    "--debug", is_flag=True,
    help="Enable debug mode with a smaller dataset subset.",
)
def main(
    ds_path: str,
    bs: int,
    num_workers: int,
    save_path: str,
    endpoint: str,
    flow: str,
    checkpoint_dir: str,
    save_freq: int,
    debug: bool,
) -> None:
    """CLI entry point for running data generation flows.

    Parameters
    ----------
    ds_path : str
        Path to the dataset file.
    bs : int
        Batch size for processing.
    num_workers : int
        Number of worker processes to use.
    save_path : str
        Path where the output will be saved.
    endpoint : str
        API endpoint for data processing.
    flow : str
        Path to the flow configuration file.
    checkpoint_dir : str
        Directory path for saving checkpoints.
    save_freq : int
        Frequency (in batches) at which to save checkpoints.
    debug : bool
        If True, enables debug mode with a smaller dataset subset.

    Returns
    -------
    None
    """
    # The docstring above is left word-for-word because click shows a
    # command's docstring as its --help text.
    # Translate the CLI option names (--bs, --flow) into run_flow's
    # parameter names; every other option passes through under its own name.
    options = {
        "ds_path": ds_path,
        "save_path": save_path,
        "endpoint": endpoint,
        "flow_path": flow,
        "checkpoint_dir": checkpoint_dir,
        "batch_size": bs,
        "num_workers": num_workers,
        "save_freq": save_freq,
        "debug": debug,
    }
    run_flow(**options)
201
+
202
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    # main() is a click command: its arguments come from the command line,
    # which is why it is called with none here.
    # pylint: disable=no-value-for-parameter
    main()
@@ -0,0 +1,13 @@
1
+ - block_type: LLMBlock
2
+ block_config:
3
+ block_name: gen_mmlu_knowledge
4
+ config_path: configs/knowledge/mcq_generation.yaml
5
+ model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
6
+ output_cols:
7
+ - mmlubench_question
8
+ - mmlubench_answer
9
+ gen_kwargs:
10
+ temperature: 0
11
+ max_tokens: 2048
12
+ drop_duplicates:
13
+ - mmlubench_question
@@ -0,0 +1,12 @@
1
+ - block_type: LLMBlock
2
+ block_config:
3
+ block_name: gen_knowledge
4
+ config_path: configs/knowledge/simple_generate_qa.yaml
5
+ model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
6
+ output_cols:
7
+ - output
8
+ gen_kwargs:
9
+ temperature: 0.7
10
+ max_tokens: 2048
11
+ drop_duplicates:
12
+ - output
@@ -0,0 +1,89 @@
1
+ - block_type: LLMBlock
2
+ block_config:
3
+ block_name: gen_knowledge
4
+ config_path: configs/knowledge/generate_questions_responses.yaml
5
+ model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
6
+ output_cols:
7
+ - question
8
+ - response
9
+ parser_kwargs:
10
+ parser_name: custom
11
+ parsing_pattern: "\\[(?:Question|QUESTION)\\]\\s*(.*?)\\s*\\[(?:Answer|ANSWER)\\]\\s*(.*?)\\s*(?=\\[(?:Question|QUESTION)\\]|$)"
12
+ parser_cleanup_tags:
13
+ - "[END]"
14
+ gen_kwargs:
15
+ max_tokens: 2048
16
+ drop_duplicates:
17
+ - question
18
+
19
+ - block_type: LLMBlock
20
+ block_config:
21
+ block_name: eval_faithfulness_qa_pair
22
+ config_path: configs/knowledge/evaluate_faithfulness.yaml
23
+ model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
24
+ output_cols:
25
+ - explanation
26
+ - judgment
27
+ gen_kwargs:
28
+ max_tokens: 2048
29
+
30
+ - block_type: FilterByValueBlock
31
+ block_config:
32
+ block_name: filter_faithfulness
33
+ filter_column: judgment
34
+ filter_value: "YES"
35
+ operation: operator.eq
36
+ batch_kwargs:
37
+ num_procs: 8
38
+ drop_columns:
39
+ - judgment
40
+ - explanation
41
+
42
+ - block_type: LLMBlock
43
+ block_config:
44
+ block_name: eval_relevancy_qa_pair
45
+ config_path: configs/knowledge/evaluate_relevancy.yaml
46
+ model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
47
+ output_cols:
48
+ - feedback
49
+ - score
50
+ gen_kwargs:
51
+ max_tokens: 2048
52
+
53
+ - block_type: FilterByValueBlock
54
+ block_config:
55
+ block_name: filter_relevancy
56
+ filter_column: score
57
+ filter_value: 2.0
58
+ operation: operator.eq
59
+ convert_dtype: float
60
+ batch_kwargs:
61
+ num_procs: 8
62
+ drop_columns:
63
+ - feedback
64
+ - score
65
+
66
+ - block_type: LLMBlock
67
+ block_config:
68
+ block_name: eval_verify_question
69
+ config_path: configs/knowledge/evaluate_question.yaml
70
+ model_id: mistralai/Mixtral-8x7B-Instruct-v0.1
71
+ output_cols:
72
+ - explanation
73
+ - rating
74
+ gen_kwargs:
75
+ max_tokens: 2048
76
+
77
+ - block_type: FilterByValueBlock
78
+ block_config:
79
+ block_name: filter_verify_question
80
+ filter_column: rating
81
+ filter_value: 1.0
82
+ operation: operator.eq
83
+ convert_dtype: float
84
+ batch_kwargs:
85
+ num_procs: 8
86
+ drop_columns:
87
+ - explanation
88
+ - rating
89
+ - __index_level_0__