sdg-hub 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. sdg_hub/__init__.py +28 -1
  2. sdg_hub/_version.py +2 -2
  3. sdg_hub/core/__init__.py +22 -0
  4. sdg_hub/core/blocks/__init__.py +58 -0
  5. sdg_hub/core/blocks/base.py +313 -0
  6. sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
  7. sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
  8. sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
  9. sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
  10. sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
  11. sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
  12. sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
  13. sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
  14. sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
  15. sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
  16. sdg_hub/core/blocks/evaluation/__init__.py +9 -0
  17. sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
  18. sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
  19. sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
  20. sdg_hub/core/blocks/filtering/__init__.py +12 -0
  21. sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
  22. sdg_hub/core/blocks/llm/__init__.py +25 -0
  23. sdg_hub/core/blocks/llm/client_manager.py +398 -0
  24. sdg_hub/core/blocks/llm/config.py +336 -0
  25. sdg_hub/core/blocks/llm/error_handler.py +368 -0
  26. sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
  27. sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
  28. sdg_hub/core/blocks/llm/text_parser_block.py +310 -0
  29. sdg_hub/core/blocks/registry.py +331 -0
  30. sdg_hub/core/blocks/transform/__init__.py +23 -0
  31. sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
  32. sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
  33. sdg_hub/core/blocks/transform/melt_columns.py +126 -0
  34. sdg_hub/core/blocks/transform/rename_columns.py +69 -0
  35. sdg_hub/core/blocks/transform/text_concat.py +102 -0
  36. sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
  37. sdg_hub/core/flow/__init__.py +20 -0
  38. sdg_hub/core/flow/base.py +980 -0
  39. sdg_hub/core/flow/metadata.py +344 -0
  40. sdg_hub/core/flow/migration.py +187 -0
  41. sdg_hub/core/flow/registry.py +330 -0
  42. sdg_hub/core/flow/validation.py +265 -0
  43. sdg_hub/{utils → core/utils}/__init__.py +6 -4
  44. sdg_hub/{utils → core/utils}/datautils.py +1 -3
  45. sdg_hub/core/utils/error_handling.py +208 -0
  46. sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
  47. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
  48. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
  49. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
  50. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
  51. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
  52. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
  53. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +191 -0
  54. sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
  55. sdg_hub-0.2.0.dist-info/METADATA +218 -0
  56. sdg_hub-0.2.0.dist-info/RECORD +63 -0
  57. sdg_hub/blocks/__init__.py +0 -42
  58. sdg_hub/blocks/block.py +0 -96
  59. sdg_hub/blocks/llmblock.py +0 -375
  60. sdg_hub/blocks/openaichatblock.py +0 -556
  61. sdg_hub/blocks/utilblocks.py +0 -597
  62. sdg_hub/checkpointer.py +0 -139
  63. sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
  64. sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
  65. sdg_hub/configs/annotations/detailed_description.yaml +0 -10
  66. sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
  67. sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
  68. sdg_hub/configs/knowledge/__init__.py +0 -0
  69. sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
  70. sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
  71. sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
  72. sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
  73. sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
  74. sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
  75. sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
  76. sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
  77. sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
  78. sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
  79. sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
  80. sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
  81. sdg_hub/configs/knowledge/router.yaml +0 -12
  82. sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
  83. sdg_hub/configs/reasoning/__init__.py +0 -0
  84. sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
  85. sdg_hub/configs/skills/__init__.py +0 -0
  86. sdg_hub/configs/skills/analyzer.yaml +0 -48
  87. sdg_hub/configs/skills/annotation.yaml +0 -36
  88. sdg_hub/configs/skills/contexts.yaml +0 -28
  89. sdg_hub/configs/skills/critic.yaml +0 -60
  90. sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
  91. sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
  92. sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
  93. sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
  94. sdg_hub/configs/skills/freeform_questions.yaml +0 -34
  95. sdg_hub/configs/skills/freeform_responses.yaml +0 -39
  96. sdg_hub/configs/skills/grounded_questions.yaml +0 -38
  97. sdg_hub/configs/skills/grounded_responses.yaml +0 -59
  98. sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
  99. sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
  100. sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
  101. sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
  102. sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
  103. sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
  104. sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
  105. sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
  106. sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
  107. sdg_hub/configs/skills/judge.yaml +0 -53
  108. sdg_hub/configs/skills/planner.yaml +0 -67
  109. sdg_hub/configs/skills/respond.yaml +0 -8
  110. sdg_hub/configs/skills/revised_responder.yaml +0 -78
  111. sdg_hub/configs/skills/router.yaml +0 -59
  112. sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
  113. sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
  114. sdg_hub/flow.py +0 -477
  115. sdg_hub/flow_runner.py +0 -450
  116. sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
  117. sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
  118. sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
  119. sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -148
  120. sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
  121. sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
  122. sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
  123. sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
  124. sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
  125. sdg_hub/pipeline.py +0 -121
  126. sdg_hub/prompts.py +0 -74
  127. sdg_hub/registry.py +0 -122
  128. sdg_hub/sdg.py +0 -206
  129. sdg_hub/utils/config_validation.py +0 -91
  130. sdg_hub/utils/error_handling.py +0 -94
  131. sdg_hub/utils/validation_result.py +0 -10
  132. sdg_hub-0.1.3.dist-info/METADATA +0 -190
  133. sdg_hub-0.1.3.dist-info/RECORD +0 -89
  134. sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
  135. /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
  136. /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
  137. {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/WHEEL +0 -0
  138. {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/licenses/LICENSE +0 -0
  139. {sdg_hub-0.1.3.dist-info → sdg_hub-0.2.0.dist-info}/top_level.txt +0 -0
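
The headline change is structural: the 0.1.3 top-level runtime (flow.py, sdg.py, pipeline.py, registry.py) and its prompt configs are deleted, and a new sdg_hub.core package (blocks, flow, utils) plus self-contained flow directories under sdg_hub/flows/ take their place. For orientation, the module docstring of the deleted sdg_hub/flow.py (reproduced further below) documents the 0.1.3-era entry point. A minimal sketch of that old usage, assuming an OpenAI client (llm_client was typed Any; the deleted openaichatblock.py suggests OpenAI-compatible clients were the common case):

# 0.1.3-era usage, reconstructed from the deleted flow.py docstring below.
# The OpenAI client is an illustrative assumption, not the only option.
from datasets import Dataset
from openai import OpenAI
from sdg_hub.flow import Flow

client = OpenAI()
flow = Flow(llm_client=client).get_flow_from_file(
    "flows/generation/skills/synth_skills.yaml"  # removed in 0.2.0 (item 124 above)
)
dataset = flow.generate(Dataset.from_list([{"task_description": "..."}]))

None of these imports survive in 0.2.0; the replacement flow runtime lives under sdg_hub/core/flow/ (items 37-42 above, including a migration.py shim).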
sdg_hub/configs/skills/revised_responder.yaml DELETED
@@ -1,78 +0,0 @@
- system: |
-   You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
-
- introduction: |
-   Your task is to revise the response to an user query. You will be given a query from an user and a response from a model. Your task is to provide a better response based on the plan given.
-
- principles: |
-   * You will revise the model response according to the plan given.
-   * The revised response should adhere to the plan and should be better than the original response.
-   * Note that the revised response will be evaluated by a human expert and should thus be of high quality.
-   * Do not have any irrelevant information in the revised response. Specifically do not include any self-referential information in the revised response.
-   * Your response should only include the revised response. Please do not include any other information like the query, analysis, rubric, etc.
-   * Your response will become invalid if it contains any meta-review about how you are revising the response. So please avoid including any such information.
-   * If the plan mentions that there is no need to provide a plan for improvement, simply return the original response as the revised response.
-   * Return the revised response between [Start of Revised Response] and [End of Revised Response] tags.
-
- examples: |
-   To help you understand the task, here is an example:
-
-   [Start of Query]
-   {{ icl_query }}
-   [End of Query]
-
-   [Start of Response]
-   {{ icl_response }}
-   [End of Response]
-
-   [Start of Analysis]
-   {{ icl_analysis }}
-   [End of Analysis]
-
-   [Start of Rubric]
-   {{ icl_rubric }}
-   [End of Rubric]
-
-   [Start of Critique]
-   {{ icl_critique }}
-   [End of Critique]
-
-   [Start of Plan]
-   {{ icl_plan }}
-   [End of Plan]
-
-   [Start of Revised Response]
-   {{ icl_revised_response }}
-   [End of Revised Response]
-
- generation: |
-   Now it's your turn to revise the response to the following query. Remember to follow the paradigm and return the revised response in the respective section in the same format as above. Strictly do not include any meta-review or meta-information about how the response was improved or revised. Your response should only include the revised response. You will be heavily penalized if you include any information about the revision process or if you have any reference about how you revised the response.
-
-   [Start of Query]
-   {{ question }}
-   [End of Query]
-
-   [Start of Response]
-   {{ response }}
-   [End of Response]
-
-   [Start of Analysis]
-   {{ analysis }}
-   [End of Analysis]
-
-   [Start of Rubric]
-   {{ rubric }}
-   [End of Rubric]
-
-   [Start of Critique]
-   {{ critique }}
-   [End of Critique]
-
-   [Start of Plan]
-   {{ plan }}
-   [End of Plan]
-
-   Start your response with the tag [Start of Revised Response] and end it with the tag [End of Revised Response].
-
- start_tags: ["[Start of Revised Response]"]
- end_tags: ["[End of Revised Response]"]
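
Configs like the one above are Jinja2 prompt templates with declared output tags; the deleted flow.py below resolves them via config_path, loads them with yaml.safe_load, and checks their Jinja variables against dataset columns. A minimal rendering/extraction sketch, assuming the file above is saved locally as revised_responder.yaml and that the variable values are placeholders:

import yaml
from jinja2 import Template

with open("revised_responder.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Fill the generation template's variables (placeholders here).
prompt = Template(cfg["generation"]).render(
    question="...", response="...", analysis="...",
    rubric="...", critique="...", plan="...",
)

start, end = cfg["start_tags"][0], cfg["end_tags"][0]

def extract(completion: str) -> str:
    # Keep only the text between the configured start/end tags.
    return completion.split(start, 1)[-1].split(end, 1)[0].strip()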
sdg_hub/configs/skills/router.yaml DELETED
@@ -1,59 +0,0 @@
- system: |
-   You are a skill classifier. Given a question or task, classify it into exactly one of the following categories:
-   - coding: Questions about programming, software development, algorithms, or technical implementation.
-   - extraction: Tasks that require identifying and pulling out specific pieces of information from text or data, without analysis or transformation. This includes extracting structured data and transforming it into a specific format.
-   - humanities: Questions about economics, social sciences, literature, history, philosophy, or arts that require analysis, comparison, or evaluation.
-   - math: Questions requiring mathematical calculations, proofs, probability, or numerical reasoning.
-   - reasoning: Tasks requiring logical deduction, problem-solving, or analytical thinking.
-   - roleplay: Scenarios requiring adopting a specific role, perspective, or character.
-   - stem: Questions about science, technology, engineering, or mathematics that require explaining technical or scientific concepts.
-   - writing: Tasks that require creating new content, compositions, or text generation.
-
-   Only output the category name, and nothing else. Please do not output anything except one of these exact words.
-
- introduction: |
-   Classify the given question into one of the predefined categories based on its primary focus and requirements.
-
- principles: |
-   1. Each question should be classified into exactly one category.
-   2. Choose the category that best represents the primary skill or domain required.
-   3. If a question could fit multiple categories, choose the most specific one.
-   4. Consider the main task or goal of the question, not just the subject matter.
-   5. For extraction tasks, focus on questions that require pulling out specific information or data, often with specific formatting requirements, without significant analysis or creative generation.
-   6. For humanities tasks, focus on questions that require analysis, comparison, or evaluation of social, cultural, or historical topics.
-   7. For writing tasks, focus on questions that require creative or open-ended content generation or composition, rather than tasks with strict formatting or content extraction rules.
-
- examples: |
-   Question: Extract and categorize the health-related themes, such as 'symptoms', 'treatments', and 'prevention'
-   Category: extraction
-
-   Question: Summarize the story with three bullet points using only nouns and adjectives, without verbs.
-   Category: extraction
-
-   Question: Can you analyze the relationship between economic indicators like GDP, interest rates, and consumer behavior?
-   Category: humanities
-
-   Question: Calculate the area of a triangle with base 6 units and height 8 units
-   Category: math
-
-   Question: Suppose you are in a marathon and you just passed the person in third place. What position are you in?
-   Category: reasoning
-
-   Question: You are a medieval blacksmith. Describe your typical workday and the challenges you face
-   Category: roleplay
-
-   Question: In the realm of quantum mechanics, what is wave-particle duality, and how does it contribute to the understanding of quantum systems?
-   Category: STEM
-
-   Question: Write a travel blog post about exploring the ancient temples and street food scene in Bangkok, Thailand
-   Category: writing
-
- generation: |
-   Question: {{ question }}
-
-   Based on the above question, classify it into exactly one of these categories: coding, extraction, humanities, math, reasoning, roleplay, STEM, or writing.
-
-   Category:
-
- start_tags: [""]
- end_tags: [""]
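
Since start_tags and end_tags are empty strings here, the whole completion is the label. A small validation sketch for the router output (lowercasing is an assumption on the caller's side, since the prompt itself mixes "stem" and "STEM"):

ALLOWED = {"coding", "extraction", "humanities", "math",
           "reasoning", "roleplay", "stem", "writing"}

def parse_category(completion: str) -> str:
    # The router prompt asks for exactly one category name and nothing else.
    label = completion.strip().lower()
    if label not in ALLOWED:
        raise ValueError(f"unexpected category: {label!r}")
    return label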
sdg_hub/configs/skills/simple_generate_qa_freeform.yaml DELETED
@@ -1,27 +0,0 @@
- system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
-
- introduction: Develop a series of question and answer pairs to perform a task.
-
- principles: |
-   Here are the requirements:
-   1. Try not to repeat the verb for each instruction to maximize diversity.
-   2. The language used for the instruction also should be diverse. For example, you should combine questions with imperative instructions.
-   3. The type of instructions should be similar to provided examples. The generated instruction and the output should be grounded in the provided document.
-   4. A GPT language model should be able to complete the instruction. For example, do not ask the assistant to create any visual or audio output. For another example, do not ask the assistant to wake you up at 5pm or set a reminder because it cannot perform any action.
-   5. The instructions should be in English.
-   6. The instructions should be 1 to 2 sentences long. Either an imperative sentence or a question is permitted.
-   7. The output should be an appropriate response to the input and the instruction. Long outputs are preferable.
-
- examples: |
-   The task is {{task_description}}.
-
-   Here is an example to help you understand the type of questions that are asked for:
-
-   {{seed_question}}
-   {{seed_response}}
-
- generation: |
-   Provide a single question and answer pair based on the examples.
-
- start_tags: [""]
- end_tags: [""]
sdg_hub/configs/skills/simple_generate_qa_grounded.yaml DELETED
@@ -1,31 +0,0 @@
- system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
-
- introduction: Develop a series of question and answer pairs to perform a task.
-
- principles: |
-   Here are the requirements:
-   1. Try not to repeat the verb for each instruction to maximize diversity.
-   2. The language used for the instruction also should be diverse. For example, you should combine questions with imperative instructions.
-   3. The type of instructions should be similar to provided examples. The generated instruction and the output should be grounded in the provided document.
-   4. A GPT language model should be able to complete the instruction. For example, do not ask the assistant to create any visual or audio output. For another example, do not ask the assistant to wake you up at 5pm or set a reminder because it cannot perform any action.
-   5. The instructions should be in English.
-   6. The instructions should be 1 to 2 sentences long. Either an imperative sentence or a question is permitted.
-   7. The output should be an appropriate response to the input and the instruction. Long outputs are preferable.
-
- examples: |
-   The task is {{task_description}}.
-
-   Here is some context for the example question:
-
-   {{seed_context}}
-
-   Here is an example to help you understand the type of questions that are asked for:
-
-   {{seed_question}}
-   {{seed_response}}
-
- generation: |
-   Provide a single question and answer pair based on the example.
-
- start_tags: [""]
- end_tags: [""]
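
The two configs above differ only in their examples section: the grounded variant threads a seed_context variable through the prompt. A short sketch rendering both (file names as above; the row values are placeholders):

import yaml
from jinja2 import Template

row = {
    "task_description": "writing product descriptions",
    "seed_context": "Product specs: 12 oz ceramic mug, dishwasher safe.",
    "seed_question": "Describe this mug for an online store.",
    "seed_response": "A sturdy 12 oz ceramic mug that survives the dishwasher.",
}

for path in ("simple_generate_qa_freeform.yaml", "simple_generate_qa_grounded.yaml"):
    with open(path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    # Jinja ignores variables a template never references, so one row renders
    # both configs; only the grounded template emits seed_context.
    print(Template(cfg["examples"]).render(**row))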
sdg_hub/flow.py DELETED
@@ -1,477 +0,0 @@
- """
- Flow module for managing data generation pipelines.
-
- This module provides the core Flow class that handles both configuration loading and execution
- of data generation blocks. The Flow class serves as the main interface for defining and running
- data generation pipelines, supporting both direct usage with SDG and backward compatibility
- through the deprecated Pipeline class.
-
- Example:
-     >>> flow = Flow(llm_client)
-     >>> flow = flow.get_flow_from_file("path/to/flow.yaml")
-     >>> dataset = flow.generate(input_dataset)
-
- Note:
-     This module is part of the SDG Hub package and is designed to work in conjunction
-     with the SDG class for distributed data generation.
- """
-
- # SPDX-License-Identifier: Apache-2.0
- # Standard
- from abc import ABC
- from importlib import resources
- from typing import Any, Callable, Dict, List, Optional
- import operator
- import os
-
- # Third Party
- from datasets import Dataset
- from datasets.data_files import EmptyDatasetError
- from jinja2 import Environment, meta
- from rich.console import Console
- from rich.table import Table
- import yaml
-
- # Local
- from .blocks import * # needed to register blocks
- from .logger_config import setup_logger
- from .prompts import * # needed to register prompts
- from .registry import BlockRegistry, PromptRegistry
- from .utils.config_validation import validate_prompt_config_schema
- from .utils.path_resolution import resolve_path
- from .utils.validation_result import ValidationResult
-
- logger = setup_logger(__name__)
-
-
- OPERATOR_MAP: Dict[str, Callable] = {
-     "operator.eq": operator.eq,
-     "operator.ge": operator.ge,
-     "operator.le": operator.le,
-     "operator.gt": operator.gt,
-     "operator.lt": operator.lt,
-     "operator.ne": operator.ne,
-     "operator.contains": operator.contains,
- }
-
- CONVERT_DTYPE_MAP: Dict[str, Callable] = {
-     "float": float,
-     "int": int,
- }
-
-
- class Flow(ABC):
-     """A class representing a data generation flow.
-
-     This class handles both configuration loading and execution of data generation
-     blocks. It can be used directly with SDG or through the deprecated Pipeline class.
-     """
-
-     def __init__(
-         self,
-         llm_client: Any,
-         num_samples_to_generate: Optional[int] = None,
-         log_level: Optional[str] = None,
-     ) -> None:
-         """
-         Initialize the Flow class.
-
-         Parameters
-         ----------
-         llm_client : Any
-             The LLM client to use for generation.
-         num_samples_to_generate : Optional[int], optional
-             Number of samples to generate, by default None
-         log_level : Optional[str], optional
-             Logging verbosity level, by default None
-
-         Attributes
-         ----------
-         llm_client : Any
-             The LLM client instance.
-         base_path : str
-             Base path for resource files.
-         registered_blocks : Dict[str, Any]
-             Registry of available blocks.
-         chained_blocks : Optional[List[Dict[str, Any]]]
-             List of block configurations.
-         num_samples_to_generate : Optional[int]
-             Number of samples to generate.
-
-         """
-         self.llm_client = llm_client
-         self.base_path = str(resources.files(__package__))
-         self.registered_blocks = BlockRegistry.get_registry()
-         self.chained_blocks = None # Will be set by get_flow_from_file
-         self.num_samples_to_generate = num_samples_to_generate
-
-         # Logging verbosity level
-         self.log_level = log_level or os.getenv("SDG_HUB_LOG_LEVEL", "normal").lower()
-         self.console = Console() if self.log_level in ["verbose", "debug"] else None
-
-     def _log_block_info(
-         self, index: int, total: int, name: str, ds: Dataset, stage: str
-     ) -> None:
-         if self.log_level in ["verbose", "debug"] and self.console:
-             table = Table(
-                 title=f"{stage} Block {index + 1}/{total}: {name}", show_header=True
-             )
-             table.add_column("Metric", style="cyan", no_wrap=True)
-             table.add_column("Value", style="magenta")
-             table.add_row("Rows", str(len(ds)))
-             table.add_row("Columns", ", ".join(ds.column_names))
-             self.console.print(table)
-
-     def _getFilePath(self, dirs: List[str], filename: str) -> str:
-         """Find a named configuration file.
-
-         Files are checked in the following order:
-         1. Absolute path is always used
-         2. Checked relative to the directories in "dirs"
-         3. Relative to the current directory
-
-         Parameters
-         ----------
-         dirs : List[str]
-             Directories in which to search for the file.
-         filename : str
-             The path to the configuration file.
-
-         Returns
-         -------
-         str
-             Selected file path.
-         """
-         return resolve_path(filename, dirs)
-
-     def _drop_duplicates(self, dataset: Dataset, cols: List[str]) -> Dataset:
-         """Drop duplicates from the dataset based on the columns provided.
-
-         Parameters
-         ----------
-         dataset : Dataset
-             The input dataset.
-         cols : List[str]
-             Columns to consider for duplicate detection.
-
-         Returns
-         -------
-         Dataset
-             Dataset with duplicates removed.
-         """
-         df = dataset.to_pandas()
-         df = df.drop_duplicates(subset=cols).reset_index(drop=True)
-         return Dataset.from_pandas(df)
-
-     def generate(self, dataset: Dataset) -> Dataset:
-         """Generate the dataset by running the pipeline steps.
-
-         Parameters
-         ----------
-         dataset : Dataset
-             The input dataset to process.
-
-         Returns
-         -------
-         Dataset
-             The processed dataset.
-
-         Raises
-         ------
-         ValueError
-             If Flow has not been initialized with blocks.
-         EmptyDatasetError
-             If a block produces an empty dataset.
-         """
-         if self.chained_blocks is None:
-             raise ValueError(
-                 "Flow has not been initialized with blocks. "
-                 "Call get_flow_from_file() first. "
-                 "Or pass a list of blocks to the Flow constructor."
-             )
-
-         for i, block_prop in enumerate(self.chained_blocks):
-             block_type = block_prop["block_type"]
-             block_config = block_prop["block_config"]
-             drop_columns = block_prop.get("drop_columns", [])
-             gen_kwargs = block_prop.get("gen_kwargs", {})
-             drop_duplicates_cols = block_prop.get("drop_duplicates", False)
-             block = block_type(**block_config)
-
-             name = block_config.get("block_name", f"block_{i}")
-
-             # Logging: always show basic progress unless in quiet mode
-             if self.log_level in ["normal", "verbose", "debug"]:
-                 logger.info(
-                     f"🔄 Running block {i + 1}/{len(self.chained_blocks)}: {name}"
-                 )
-
-             # Log dataset shape before block (verbose/debug)
-             self._log_block_info(i, len(self.chained_blocks), name, dataset, "Input")
-
-             if self.log_level == "debug":
-                 logger.debug(f"Input dataset (truncated): {dataset}")
-
-             dataset = block.generate(dataset, **gen_kwargs)
-
-             if len(dataset) == 0:
-                 raise EmptyDatasetError(
-                     f"Pipeline stopped: "
-                     f"Empty dataset after running block: "
-                     f"{block_config['block_name']}"
-                 )
-
-             drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names]
-             if drop_columns:
-                 dataset = dataset.remove_columns(drop_columns_in_ds)
-
-             if drop_duplicates_cols:
-                 dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
-
-             # Log dataset shape after block (verbose/debug)
-             self._log_block_info(i, len(self.chained_blocks), name, dataset, "Output")
-
-             if self.log_level == "debug":
-                 logger.debug(f"Output dataset (truncated): {dataset}")
-
-         return dataset
-
-     def validate_config_files(self) -> "ValidationResult":
-         """
-         Validate all configuration file paths referenced in the flow blocks.
-
-         This method checks that all config files specified via `config_path` or `config_paths`
-         in each block:
-         - Exist on the filesystem
-         - Are readable by the current process
-         - Are valid YAML files (optional format check)
-
-         Returns
-         -------
-         ValidationResult
-             An object indicating whether all config files passed validation, along with a list
-             of error messages for any missing, unreadable, or invalid YAML files.
-
-         Notes
-         -----
-         This method is automatically called at the end of `get_flow_from_file()` to ensure
-         early detection of misconfigured blocks.
-         """
-         errors = []
-
-         def check_file(path: str, context: str):
-             if not os.path.isfile(path):
-                 errors.append(f"[{context}] File does not exist: {path}")
-             else:
-                 try:
-                     with open(path, "r", encoding="utf-8") as f:
-                         config_data = yaml.safe_load(f)
-                     _, validation_errors = validate_prompt_config_schema(
-                         config_data, path
-                     )
-
-                     if validation_errors:
-                         errors.extend(validation_errors)
-
-                 except PermissionError:
-                     errors.append(f"[{context}] File is not readable: {path}")
-                 except yaml.YAMLError as e:
-                     errors.append(f"[{context}] YAML load failed: {path} ({e})")
-
-         for i, block in enumerate(self.chained_blocks or []):
-             block_name = block["block_config"].get("block_name", f"block_{i}")
-
-             config_path = block["block_config"].get("config_path")
-             if config_path:
-                 check_file(config_path, f"{block_name}.config_path")
-
-             config_paths = block["block_config"].get("config_paths")
-             if isinstance(config_paths, list):
-                 for idx, path in enumerate(config_paths):
-                     check_file(path, f"{block_name}.config_paths[{idx}]")
-             elif isinstance(config_paths, dict):
-                 for key, path in config_paths.items():
-                     check_file(path, f"{block_name}.config_paths['{key}']")
-
-         return ValidationResult(valid=(len(errors) == 0), errors=errors)
-
-     def get_flow_from_file(self, yaml_path: str) -> "Flow":
-         """Load and initialize flow configuration from a YAML file.
-
-         Parameters
-         ----------
-         yaml_path : str
-             Path to the YAML configuration file.
-
-         Returns
-         -------
-         Flow
-             Self with initialized chained_blocks.
-
-         Raises
-         ------
-         FileNotFoundError
-             If the YAML file cannot be found.
-         KeyError
-             If a required block or prompt is not found in the registry.
-         """
-         yaml_path = resolve_path(yaml_path, self.base_path)
-         yaml_dir = os.path.dirname(yaml_path)
-
-         try:
-             with open(yaml_path, "r", encoding="utf-8") as yaml_file:
-                 flow = yaml.safe_load(yaml_file)
-         except FileNotFoundError as exc:
-             raise FileNotFoundError(f"File not found: {yaml_path}") from exc
-
-         # update config with class instances
-         for block in flow:
-             # check if theres an llm block in the flow
-             if "LLM" in block["block_type"]:
-                 block["block_config"]["client"] = self.llm_client
-                 # model_id and prompt templates
-                 # try to get a template using the model_id, but if model_prompt_template is provided, use that
-                 if block["block_config"].get("model_prompt", None) is None:
-                     # try to find a match in the registry
-                     matched_prompt = next(
-                         (
-                             key
-                             for key in PromptRegistry.get_registry()
-                             if key in block["block_config"]["model_id"]
-                         ),
-                         None,
-                     )
-                     if matched_prompt is not None:
-                         block["block_config"]["model_prompt"] = matched_prompt
-                     else:
-                         raise KeyError(
-                             f"Prompt not found in registry: {block['block_config']['model_id']}"
-                         )
-
-                 if self.num_samples_to_generate is not None:
-                     block["num_samples"] = self.num_samples_to_generate
-
-             # update block type to llm class instance
-             try:
-                 block["block_type"] = self.registered_blocks[block["block_type"]]
-             except KeyError as exc:
-                 raise KeyError(
-                     f"Block not found in registry: {block['block_type']}"
-                 ) from exc
-
-             # update config path to absolute path
-             if "config_path" in block["block_config"]:
-                 block["block_config"]["config_path"] = self._getFilePath(
-                     [yaml_dir, self.base_path], block["block_config"]["config_path"]
-                 )
-
-             # update config paths to absolute paths - this might be a list or a dict
-             if "config_paths" in block["block_config"]:
-                 if isinstance(block["block_config"]["config_paths"], dict):
-                     for key, path in block["block_config"]["config_paths"].items():
-                         block["block_config"]["config_paths"][key] = self._getFilePath(
-                             [yaml_dir, self.base_path], path
-                         )
-
-                 elif isinstance(block["block_config"]["config_paths"], list):
-                     for i, path in enumerate(block["block_config"]["config_paths"]):
-                         block["block_config"]["config_paths"][i] = self._getFilePath(
-                             [yaml_dir, self.base_path], path
-                         )
-
-             if "operation" in block["block_config"]:
-                 block["block_config"]["operation"] = OPERATOR_MAP[
-                     block["block_config"]["operation"]
-                 ]
-
-             if "convert_dtype" in block["block_config"]:
-                 block["block_config"]["convert_dtype"] = CONVERT_DTYPE_MAP[
-                     block["block_config"]["convert_dtype"]
-                 ]
-
-         # Store the chained blocks and return self
-         self.chained_blocks = flow
-
-         # Validate config files
-         result = self.validate_config_files()
-         if not result.valid:
-             raise ValueError("Invalid config files:\n\n".join(result.errors))
-
-         return self
-
-     def validate_flow(self, dataset: Dataset) -> "ValidationResult":
-         """
-         Validate that all required dataset columns are present before executing the flow.
-
-         This includes:
-         - Columns referenced in Jinja templates for LLM blocks
-         - Columns required by specific utility blocks (e.g. filter_column, choice_col, etc.)
-
-         Parameters
-         ----------
-         dataset : Dataset
-             The input dataset to validate against.
-
-         Returns
-         -------
-         ValidationResult
-             Whether the dataset has all required columns, and which ones are missing.
-         """
-         errors = []
-         all_columns = set(dataset.column_names)
-
-         for i, block in enumerate(self.chained_blocks or []):
-             name = block["block_config"].get("block_name", f"block_{i}")
-             block_type = block["block_type"]
-             config = block["block_config"]
-
-             # LLM Block: parse Jinja vars
-             cls_name = (
-                 block_type.__name__
-                 if isinstance(block_type, type)
-                 else block_type.__class__.__name__
-             )
-             logger.info(f"Validating block: {name} ({cls_name})")
-             if "LLM" in cls_name:
-                 config_path = config.get("config_path")
-                 if config_path and os.path.isfile(config_path):
-                     with open(config_path, "r", encoding="utf-8") as f:
-                         content = f.read()
-                     env = Environment()
-                     ast = env.parse(content)
-                     vars_found = meta.find_undeclared_variables(ast)
-                     for var in vars_found:
-                         if var not in all_columns:
-                             errors.append(
-                                 f"[{name}] Missing column for prompt var: '{var}'"
-                             )
-
-             # FilterByValueBlock
-             if "FilterByValueBlock" in str(block_type):
-                 col = config.get("filter_column")
-                 if col and col not in all_columns:
-                     errors.append(f"[{name}] Missing filter_column: '{col}'")
-
-             # SelectorBlock
-             if "SelectorBlock" in str(block_type):
-                 col = config.get("choice_col")
-                 if col and col not in all_columns:
-                     errors.append(f"[{name}] Missing choice_col: '{col}'")
-
-                 choice_map = config.get("choice_map", {})
-                 for col in choice_map.values():
-                     if col not in all_columns:
-                         errors.append(
-                             f"[{name}] choice_map references missing column: '{col}'"
-                         )
-
-             # CombineColumnsBlock
-             if "CombineColumnsBlock" in str(block_type):
-                 cols = config.get("columns", [])
-                 for col in cols:
-                     if col not in all_columns:
-                         errors.append(
-                             f"[{name}] CombineColumnsBlock requires column: '{col}'"
-                         )
-
-         return ValidationResult(valid=(len(errors) == 0), errors=errors)
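
The OPERATOR_MAP/CONVERT_DTYPE_MAP indirection above is what kept 0.1.3 flow YAMLs pure data: a block named its comparison and dtype as strings, and get_flow_from_file swapped in the callables. A minimal sketch of that lookup (the filter_column/filter_value keys are assumptions borrowed from FilterByValueBlock's config, which this diff does not show):

import operator

OPERATOR_MAP = {"operator.eq": operator.eq, "operator.ge": operator.ge}
CONVERT_DTYPE_MAP = {"float": float, "int": int}

# A block entry roughly as it would appear in a 0.1.3 flow YAML.
block_config = {
    "filter_column": "score",
    "filter_value": "2.0",
    "operation": "operator.ge",
    "convert_dtype": "float",
}

op = OPERATOR_MAP[block_config["operation"]]
cast = CONVERT_DTYPE_MAP[block_config["convert_dtype"]]
threshold = cast(block_config["filter_value"])
assert op(2.5, threshold)  # a row with score 2.5 passes the >= 2.0 filter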