sdg-hub 0.1.4__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/__init__.py +28 -1
- sdg_hub/_version.py +2 -2
- sdg_hub/core/__init__.py +22 -0
- sdg_hub/core/blocks/__init__.py +58 -0
- sdg_hub/core/blocks/base.py +313 -0
- sdg_hub/core/blocks/deprecated_blocks/__init__.py +29 -0
- sdg_hub/core/blocks/deprecated_blocks/combine_columns.py +93 -0
- sdg_hub/core/blocks/deprecated_blocks/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/filter_by_value.py +103 -0
- sdg_hub/core/blocks/deprecated_blocks/flatten_columns.py +94 -0
- sdg_hub/core/blocks/deprecated_blocks/llmblock.py +479 -0
- sdg_hub/core/blocks/deprecated_blocks/rename_columns.py +88 -0
- sdg_hub/core/blocks/deprecated_blocks/sample_populator.py +58 -0
- sdg_hub/core/blocks/deprecated_blocks/selector.py +97 -0
- sdg_hub/core/blocks/deprecated_blocks/set_to_majority_value.py +88 -0
- sdg_hub/core/blocks/evaluation/__init__.py +9 -0
- sdg_hub/core/blocks/evaluation/evaluate_faithfulness_block.py +564 -0
- sdg_hub/core/blocks/evaluation/evaluate_relevancy_block.py +564 -0
- sdg_hub/core/blocks/evaluation/verify_question_block.py +564 -0
- sdg_hub/core/blocks/filtering/__init__.py +12 -0
- sdg_hub/core/blocks/filtering/column_value_filter.py +188 -0
- sdg_hub/core/blocks/llm/__init__.py +27 -0
- sdg_hub/core/blocks/llm/client_manager.py +398 -0
- sdg_hub/core/blocks/llm/config.py +336 -0
- sdg_hub/core/blocks/llm/error_handler.py +368 -0
- sdg_hub/core/blocks/llm/llm_chat_block.py +542 -0
- sdg_hub/core/blocks/llm/llm_chat_with_parsing_retry_block.py +491 -0
- sdg_hub/core/blocks/llm/prompt_builder_block.py +368 -0
- sdg_hub/core/blocks/llm/text_parser_block.py +357 -0
- sdg_hub/core/blocks/registry.py +331 -0
- sdg_hub/core/blocks/transform/__init__.py +23 -0
- sdg_hub/core/blocks/transform/duplicate_columns.py +88 -0
- sdg_hub/core/blocks/transform/index_based_mapper.py +225 -0
- sdg_hub/core/blocks/transform/melt_columns.py +126 -0
- sdg_hub/core/blocks/transform/rename_columns.py +69 -0
- sdg_hub/core/blocks/transform/text_concat.py +102 -0
- sdg_hub/core/blocks/transform/uniform_col_val_setter.py +101 -0
- sdg_hub/core/flow/__init__.py +20 -0
- sdg_hub/core/flow/base.py +1209 -0
- sdg_hub/core/flow/checkpointer.py +333 -0
- sdg_hub/core/flow/metadata.py +389 -0
- sdg_hub/core/flow/migration.py +198 -0
- sdg_hub/core/flow/registry.py +393 -0
- sdg_hub/core/flow/validation.py +277 -0
- sdg_hub/{utils → core/utils}/__init__.py +7 -4
- sdg_hub/core/utils/datautils.py +63 -0
- sdg_hub/core/utils/error_handling.py +208 -0
- sdg_hub/core/utils/flow_id_words.yaml +231 -0
- sdg_hub/core/utils/flow_identifier.py +94 -0
- sdg_hub/{utils → core/utils}/path_resolution.py +2 -2
- sdg_hub/core/utils/yaml_utils.py +59 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/atomic_facts.yaml +40 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/detailed_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_faithfulness.yaml +64 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_question.yaml +29 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/evaluate_relevancy.yaml +81 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/extractive_summary.yaml +13 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/flow.yaml +192 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/generate_questions_responses.yaml +54 -0
- sdg_hub-0.2.1.dist-info/METADATA +221 -0
- sdg_hub-0.2.1.dist-info/RECORD +68 -0
- sdg_hub/blocks/__init__.py +0 -42
- sdg_hub/blocks/block.py +0 -96
- sdg_hub/blocks/llmblock.py +0 -375
- sdg_hub/blocks/openaichatblock.py +0 -556
- sdg_hub/blocks/utilblocks.py +0 -597
- sdg_hub/checkpointer.py +0 -139
- sdg_hub/configs/annotations/cot_reflection.yaml +0 -34
- sdg_hub/configs/annotations/detailed_annotations.yaml +0 -28
- sdg_hub/configs/annotations/detailed_description.yaml +0 -10
- sdg_hub/configs/annotations/detailed_description_icl.yaml +0 -32
- sdg_hub/configs/annotations/simple_annotations.yaml +0 -9
- sdg_hub/configs/knowledge/__init__.py +0 -0
- sdg_hub/configs/knowledge/atomic_facts.yaml +0 -46
- sdg_hub/configs/knowledge/auxilary_instructions.yaml +0 -35
- sdg_hub/configs/knowledge/detailed_summary.yaml +0 -18
- sdg_hub/configs/knowledge/evaluate_faithfulness.yaml +0 -68
- sdg_hub/configs/knowledge/evaluate_question.yaml +0 -38
- sdg_hub/configs/knowledge/evaluate_relevancy.yaml +0 -84
- sdg_hub/configs/knowledge/extractive_summary.yaml +0 -18
- sdg_hub/configs/knowledge/generate_code_questions_responses.yaml +0 -39
- sdg_hub/configs/knowledge/generate_questions.yaml +0 -82
- sdg_hub/configs/knowledge/generate_questions_responses.yaml +0 -56
- sdg_hub/configs/knowledge/generate_responses.yaml +0 -86
- sdg_hub/configs/knowledge/mcq_generation.yaml +0 -83
- sdg_hub/configs/knowledge/router.yaml +0 -12
- sdg_hub/configs/knowledge/simple_generate_qa.yaml +0 -34
- sdg_hub/configs/reasoning/__init__.py +0 -0
- sdg_hub/configs/reasoning/dynamic_cot.yaml +0 -40
- sdg_hub/configs/skills/__init__.py +0 -0
- sdg_hub/configs/skills/analyzer.yaml +0 -48
- sdg_hub/configs/skills/annotation.yaml +0 -36
- sdg_hub/configs/skills/contexts.yaml +0 -28
- sdg_hub/configs/skills/critic.yaml +0 -60
- sdg_hub/configs/skills/evaluate_freeform_pair.yaml +0 -111
- sdg_hub/configs/skills/evaluate_freeform_questions.yaml +0 -78
- sdg_hub/configs/skills/evaluate_grounded_pair.yaml +0 -119
- sdg_hub/configs/skills/evaluate_grounded_questions.yaml +0 -51
- sdg_hub/configs/skills/freeform_questions.yaml +0 -34
- sdg_hub/configs/skills/freeform_responses.yaml +0 -39
- sdg_hub/configs/skills/grounded_questions.yaml +0 -38
- sdg_hub/configs/skills/grounded_responses.yaml +0 -59
- sdg_hub/configs/skills/icl_examples/STEM.yaml +0 -56
- sdg_hub/configs/skills/icl_examples/__init__.py +0 -0
- sdg_hub/configs/skills/icl_examples/coding.yaml +0 -97
- sdg_hub/configs/skills/icl_examples/extraction.yaml +0 -36
- sdg_hub/configs/skills/icl_examples/humanities.yaml +0 -71
- sdg_hub/configs/skills/icl_examples/math.yaml +0 -85
- sdg_hub/configs/skills/icl_examples/reasoning.yaml +0 -30
- sdg_hub/configs/skills/icl_examples/roleplay.yaml +0 -45
- sdg_hub/configs/skills/icl_examples/writing.yaml +0 -80
- sdg_hub/configs/skills/judge.yaml +0 -53
- sdg_hub/configs/skills/planner.yaml +0 -67
- sdg_hub/configs/skills/respond.yaml +0 -8
- sdg_hub/configs/skills/revised_responder.yaml +0 -78
- sdg_hub/configs/skills/router.yaml +0 -59
- sdg_hub/configs/skills/simple_generate_qa_freeform.yaml +0 -27
- sdg_hub/configs/skills/simple_generate_qa_grounded.yaml +0 -31
- sdg_hub/flow.py +0 -477
- sdg_hub/flow_runner.py +0 -450
- sdg_hub/flows/generation/knowledge/mmlu_bench.yaml +0 -13
- sdg_hub/flows/generation/knowledge/simple_knowledge.yaml +0 -12
- sdg_hub/flows/generation/knowledge/synth_knowledge.yaml +0 -89
- sdg_hub/flows/generation/knowledge/synth_knowledge1.5.yaml +0 -136
- sdg_hub/flows/generation/skills/improve_responses.yaml +0 -103
- sdg_hub/flows/generation/skills/simple_freeform_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/simple_grounded_skill.yaml +0 -12
- sdg_hub/flows/generation/skills/synth_grounded_skills.yaml +0 -80
- sdg_hub/flows/generation/skills/synth_skills.yaml +0 -59
- sdg_hub/pipeline.py +0 -121
- sdg_hub/prompts.py +0 -80
- sdg_hub/registry.py +0 -122
- sdg_hub/sdg.py +0 -206
- sdg_hub/utils/config_validation.py +0 -91
- sdg_hub/utils/datautils.py +0 -14
- sdg_hub/utils/error_handling.py +0 -94
- sdg_hub/utils/validation_result.py +0 -10
- sdg_hub-0.1.4.dist-info/METADATA +0 -190
- sdg_hub-0.1.4.dist-info/RECORD +0 -89
- sdg_hub/{logger_config.py → core/utils/logger_config.py} +1 -1
- /sdg_hub/{configs/__init__.py → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab/README.md} +0 -0
- /sdg_hub/{configs/annotations → flows/qa_generation/document_grounded_qa/multi_summary_qa/instructlab}/__init__.py +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/WHEEL +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.1.4.dist-info → sdg_hub-0.2.1.dist-info}/top_level.txt +0 -0
@@ -1,78 +0,0 @@
|
|
1
|
-
system: |
|
2
|
-
You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
|
3
|
-
|
4
|
-
introduction: |
|
5
|
-
Your task is to revise the response to an user query. You will be given a query from an user and a response from a model. Your task is to provide a better response based on the plan given.
|
6
|
-
|
7
|
-
principles: |
|
8
|
-
* You will revise the model response according to the plan given.
|
9
|
-
* The revised response should adhere to the plan and should be better than the original response.
|
10
|
-
* Note that the revised response will be evaluated by a human expert and should thus be of high quality.
|
11
|
-
* Do not have any irrelevant information in the revised response. Specifically do not include any self-referential information in the revised response.
|
12
|
-
* Your response should only include the revised response. Please do not include any other information like the query, analysis, rubric, etc.
|
13
|
-
* Your response will become invalid if it contains any meta-review about how you are revising the response. So please avoid including any such information.
|
14
|
-
* If the plan mentions that there is no need to provide a plan for improvement, simply return the original response as the revised response.
|
15
|
-
* Return the revised response between [Start of Revised Response] and [End of Revised Response] tags.
|
16
|
-
|
17
|
-
examples: |
|
18
|
-
To help you understand the task, here is an example:
|
19
|
-
|
20
|
-
[Start of Query]
|
21
|
-
{{ icl_query }}
|
22
|
-
[End of Query]
|
23
|
-
|
24
|
-
[Start of Response]
|
25
|
-
{{ icl_response }}
|
26
|
-
[End of Response]
|
27
|
-
|
28
|
-
[Start of Analysis]
|
29
|
-
{{ icl_analysis }}
|
30
|
-
[End of Analysis]
|
31
|
-
|
32
|
-
[Start of Rubric]
|
33
|
-
{{ icl_rubric }}
|
34
|
-
[End of Rubric]
|
35
|
-
|
36
|
-
[Start of Critique]
|
37
|
-
{{ icl_critique }}
|
38
|
-
[End of Critique]
|
39
|
-
|
40
|
-
[Start of Plan]
|
41
|
-
{{ icl_plan }}
|
42
|
-
[End of Plan]
|
43
|
-
|
44
|
-
[Start of Revised Response]
|
45
|
-
{{ icl_revised_response }}
|
46
|
-
[End of Revised Response]
|
47
|
-
|
48
|
-
generation: |
|
49
|
-
Now it's your turn to revise the response to the following query. Remember to follow the paradigm and return the revised response in the respective section in the same format as above. Strictly do not include any meta-review or meta-information about how the response was improved or revised. Your response should only include the revised response. You will be heavily penalized if you include any information about the revision process or if you have any reference about how you revised the response.
|
50
|
-
|
51
|
-
[Start of Query]
|
52
|
-
{{ question }}
|
53
|
-
[End of Query]
|
54
|
-
|
55
|
-
[Start of Response]
|
56
|
-
{{ response }}
|
57
|
-
[End of Response]
|
58
|
-
|
59
|
-
[Start of Analysis]
|
60
|
-
{{ analysis }}
|
61
|
-
[End of Analysis]
|
62
|
-
|
63
|
-
[Start of Rubric]
|
64
|
-
{{ rubric }}
|
65
|
-
[End of Rubric]
|
66
|
-
|
67
|
-
[Start of Critique]
|
68
|
-
{{ critique }}
|
69
|
-
[End of Critique]
|
70
|
-
|
71
|
-
[Start of Plan]
|
72
|
-
{{ plan }}
|
73
|
-
[End of Plan]
|
74
|
-
|
75
|
-
Start your response with the tag [Start of Revised Response] and end it with the tag [End of Revised Response].
|
76
|
-
|
77
|
-
start_tags: ["[Start of Revised Response]"]
|
78
|
-
end_tags: ["[End of Revised Response]"]
|
@@ -1,59 +0,0 @@
|
|
1
|
-
system: |
|
2
|
-
You are a skill classifier. Given a question or task, classify it into exactly one of the following categories:
|
3
|
-
- coding: Questions about programming, software development, algorithms, or technical implementation.
|
4
|
-
- extraction: Tasks that require identifying and pulling out specific pieces of information from text or data, without analysis or transformation. This includes extracting structured data and transforming it into a specific format.
|
5
|
-
- humanities: Questions about economics, social sciences, literature, history, philosophy, or arts that require analysis, comparison, or evaluation.
|
6
|
-
- math: Questions requiring mathematical calculations, proofs, probability, or numerical reasoning.
|
7
|
-
- reasoning: Tasks requiring logical deduction, problem-solving, or analytical thinking.
|
8
|
-
- roleplay: Scenarios requiring adopting a specific role, perspective, or character.
|
9
|
-
- stem: Questions about science, technology, engineering, or mathematics that require explaining technical or scientific concepts.
|
10
|
-
- writing: Tasks that require creating new content, compositions, or text generation.
|
11
|
-
|
12
|
-
Only output the category name, and nothing else. Please do not output anything except one of these exact words.
|
13
|
-
|
14
|
-
introduction: |
|
15
|
-
Classify the given question into one of the predefined categories based on its primary focus and requirements.
|
16
|
-
|
17
|
-
principles: |
|
18
|
-
1. Each question should be classified into exactly one category.
|
19
|
-
2. Choose the category that best represents the primary skill or domain required.
|
20
|
-
3. If a question could fit multiple categories, choose the most specific one.
|
21
|
-
4. Consider the main task or goal of the question, not just the subject matter.
|
22
|
-
5. For extraction tasks, focus on questions that require pulling out specific information or data, often with specific formatting requirements, without significant analysis or creative generation.
|
23
|
-
6. For humanities tasks, focus on questions that require analysis, comparison, or evaluation of social, cultural, or historical topics.
|
24
|
-
7. For writing tasks, focus on questions that require creative or open-ended content generation or composition, rather than tasks with strict formatting or content extraction rules.
|
25
|
-
|
26
|
-
examples: |
|
27
|
-
Question: Extract and categorize the health-related themes, such as 'symptoms', 'treatments', and 'prevention'
|
28
|
-
Category: extraction
|
29
|
-
|
30
|
-
Question: Summarize the story with three bullet points using only nouns and adjectives, without verbs.
|
31
|
-
Category: extraction
|
32
|
-
|
33
|
-
Question: Can you analyze the relationship between economic indicators like GDP, interest rates, and consumer behavior?
|
34
|
-
Category: humanities
|
35
|
-
|
36
|
-
Question: Calculate the area of a triangle with base 6 units and height 8 units
|
37
|
-
Category: math
|
38
|
-
|
39
|
-
Question: Suppose you are in a marathon and you just passed the person in third place. What position are you in?
|
40
|
-
Category: reasoning
|
41
|
-
|
42
|
-
Question: You are a medieval blacksmith. Describe your typical workday and the challenges you face
|
43
|
-
Category: roleplay
|
44
|
-
|
45
|
-
Question: In the realm of quantum mechanics, what is wave-particle duality, and how does it contribute to the understanding of quantum systems?
|
46
|
-
Category: STEM
|
47
|
-
|
48
|
-
Question: Write a travel blog post about exploring the ancient temples and street food scene in Bangkok, Thailand
|
49
|
-
Category: writing
|
50
|
-
|
51
|
-
generation: |
|
52
|
-
Question: {{ question }}
|
53
|
-
|
54
|
-
Based on the above question, classify it into exactly one of these categories: coding, extraction, humanities, math, reasoning, roleplay, STEM, or writing.
|
55
|
-
|
56
|
-
Category:
|
57
|
-
|
58
|
-
start_tags: [""]
|
59
|
-
end_tags: [""]
|
@@ -1,27 +0,0 @@
|
|
1
|
-
system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
|
2
|
-
|
3
|
-
introduction: Develop a series of question and answer pairs to perform a task.
|
4
|
-
|
5
|
-
principles: |
|
6
|
-
Here are the requirements:
|
7
|
-
1. Try not to repeat the verb for each instruction to maximize diversity.
|
8
|
-
2. The language used for the instruction also should be diverse. For example, you should combine questions with imperative instructions.
|
9
|
-
3. The type of instructions should be similar to provided examples. The generated instruction and the output should be grounded in the provided document.
|
10
|
-
4. A GPT language model should be able to complete the instruction. For example, do not ask the assistant to create any visual or audio output. For another example, do not ask the assistant to wake you up at 5pm or set a reminder because it cannot perform any action.
|
11
|
-
5. The instructions should be in English.
|
12
|
-
6. The instructions should be 1 to 2 sentences long. Either an imperative sentence or a question is permitted.
|
13
|
-
7. The output should be an appropriate response to the input and the instruction. Long outputs are preferable.
|
14
|
-
|
15
|
-
examples: |
|
16
|
-
The task is {{task_description}}.
|
17
|
-
|
18
|
-
Here is an example to help you understand the type of questions that are asked for:
|
19
|
-
|
20
|
-
{{seed_question}}
|
21
|
-
{{seed_response}}
|
22
|
-
|
23
|
-
generation: |
|
24
|
-
Provide a single question and answer pair based on the examples.
|
25
|
-
|
26
|
-
start_tags: [""]
|
27
|
-
end_tags: [""]
|
@@ -1,31 +0,0 @@
|
|
1
|
-
system: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
|
2
|
-
|
3
|
-
introduction: Develop a series of question and answer pairs to perform a task.
|
4
|
-
|
5
|
-
principles: |
|
6
|
-
Here are the requirements:
|
7
|
-
1. Try not to repeat the verb for each instruction to maximize diversity.
|
8
|
-
2. The language used for the instruction also should be diverse. For example, you should combine questions with imperative instructions.
|
9
|
-
3. The type of instructions should be similar to provided examples. The generated instruction and the output should be grounded in the provided document.
|
10
|
-
4. A GPT language model should be able to complete the instruction. For example, do not ask the assistant to create any visual or audio output. For another example, do not ask the assistant to wake you up at 5pm or set a reminder because it cannot perform any action.
|
11
|
-
5. The instructions should be in English.
|
12
|
-
6. The instructions should be 1 to 2 sentences long. Either an imperative sentence or a question is permitted.
|
13
|
-
7. The output should be an appropriate response to the input and the instruction. Long outputs are preferable.
|
14
|
-
|
15
|
-
examples: |
|
16
|
-
The task is {{task_description}}.
|
17
|
-
|
18
|
-
Here is some context for the example question:
|
19
|
-
|
20
|
-
{{seed_context}}
|
21
|
-
|
22
|
-
Here is an example to help you understand the type of questions that are asked for:
|
23
|
-
|
24
|
-
{{seed_question}}
|
25
|
-
{{seed_response}}
|
26
|
-
|
27
|
-
generation: |
|
28
|
-
Provide a single question and answer pair based on the example.
|
29
|
-
|
30
|
-
start_tags: [""]
|
31
|
-
end_tags: [""]
|
sdg_hub/flow.py
DELETED
@@ -1,477 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Flow module for managing data generation pipelines.
|
3
|
-
|
4
|
-
This module provides the core Flow class that handles both configuration loading and execution
|
5
|
-
of data generation blocks. The Flow class serves as the main interface for defining and running
|
6
|
-
data generation pipelines, supporting both direct usage with SDG and backward compatibility
|
7
|
-
through the deprecated Pipeline class.
|
8
|
-
|
9
|
-
Example:
|
10
|
-
>>> flow = Flow(llm_client)
|
11
|
-
>>> flow = flow.get_flow_from_file("path/to/flow.yaml")
|
12
|
-
>>> dataset = flow.generate(input_dataset)
|
13
|
-
|
14
|
-
Note:
|
15
|
-
This module is part of the SDG Hub package and is designed to work in conjunction
|
16
|
-
with the SDG class for distributed data generation.
|
17
|
-
"""
|
18
|
-
|
19
|
-
# SPDX-License-Identifier: Apache-2.0
|
20
|
-
# Standard
|
21
|
-
from abc import ABC
|
22
|
-
from importlib import resources
|
23
|
-
from typing import Any, Callable, Dict, List, Optional
|
24
|
-
import operator
|
25
|
-
import os
|
26
|
-
|
27
|
-
# Third Party
|
28
|
-
from datasets import Dataset
|
29
|
-
from datasets.data_files import EmptyDatasetError
|
30
|
-
from jinja2 import Environment, meta
|
31
|
-
from rich.console import Console
|
32
|
-
from rich.table import Table
|
33
|
-
import yaml
|
34
|
-
|
35
|
-
# Local
|
36
|
-
from .blocks import * # needed to register blocks
|
37
|
-
from .logger_config import setup_logger
|
38
|
-
from .prompts import * # needed to register prompts
|
39
|
-
from .registry import BlockRegistry, PromptRegistry
|
40
|
-
from .utils.config_validation import validate_prompt_config_schema
|
41
|
-
from .utils.path_resolution import resolve_path
|
42
|
-
from .utils.validation_result import ValidationResult
|
43
|
-
|
44
|
-
logger = setup_logger(__name__)
|
45
|
-
|
46
|
-
|
47
|
-
OPERATOR_MAP: Dict[str, Callable] = {
|
48
|
-
"operator.eq": operator.eq,
|
49
|
-
"operator.ge": operator.ge,
|
50
|
-
"operator.le": operator.le,
|
51
|
-
"operator.gt": operator.gt,
|
52
|
-
"operator.lt": operator.lt,
|
53
|
-
"operator.ne": operator.ne,
|
54
|
-
"operator.contains": operator.contains,
|
55
|
-
}
|
56
|
-
|
57
|
-
CONVERT_DTYPE_MAP: Dict[str, Callable] = {
|
58
|
-
"float": float,
|
59
|
-
"int": int,
|
60
|
-
}
|
61
|
-
|
62
|
-
|
63
|
-
class Flow(ABC):
|
64
|
-
"""A class representing a data generation flow.
|
65
|
-
|
66
|
-
This class handles both configuration loading and execution of data generation
|
67
|
-
blocks. It can be used directly with SDG or through the deprecated Pipeline class.
|
68
|
-
"""
|
69
|
-
|
70
|
-
def __init__(
|
71
|
-
self,
|
72
|
-
llm_client: Any,
|
73
|
-
num_samples_to_generate: Optional[int] = None,
|
74
|
-
log_level: Optional[str] = None,
|
75
|
-
) -> None:
|
76
|
-
"""
|
77
|
-
Initialize the Flow class.
|
78
|
-
|
79
|
-
Parameters
|
80
|
-
----------
|
81
|
-
llm_client : Any
|
82
|
-
The LLM client to use for generation.
|
83
|
-
num_samples_to_generate : Optional[int], optional
|
84
|
-
Number of samples to generate, by default None
|
85
|
-
log_level : Optional[str], optional
|
86
|
-
Logging verbosity level, by default None
|
87
|
-
|
88
|
-
Attributes
|
89
|
-
----------
|
90
|
-
llm_client : Any
|
91
|
-
The LLM client instance.
|
92
|
-
base_path : str
|
93
|
-
Base path for resource files.
|
94
|
-
registered_blocks : Dict[str, Any]
|
95
|
-
Registry of available blocks.
|
96
|
-
chained_blocks : Optional[List[Dict[str, Any]]]
|
97
|
-
List of block configurations.
|
98
|
-
num_samples_to_generate : Optional[int]
|
99
|
-
Number of samples to generate.
|
100
|
-
|
101
|
-
"""
|
102
|
-
self.llm_client = llm_client
|
103
|
-
self.base_path = str(resources.files(__package__))
|
104
|
-
self.registered_blocks = BlockRegistry.get_registry()
|
105
|
-
self.chained_blocks = None # Will be set by get_flow_from_file
|
106
|
-
self.num_samples_to_generate = num_samples_to_generate
|
107
|
-
|
108
|
-
# Logging verbosity level
|
109
|
-
self.log_level = log_level or os.getenv("SDG_HUB_LOG_LEVEL", "normal").lower()
|
110
|
-
self.console = Console() if self.log_level in ["verbose", "debug"] else None
|
111
|
-
|
112
|
-
def _log_block_info(
|
113
|
-
self, index: int, total: int, name: str, ds: Dataset, stage: str
|
114
|
-
) -> None:
|
115
|
-
if self.log_level in ["verbose", "debug"] and self.console:
|
116
|
-
table = Table(
|
117
|
-
title=f"{stage} Block {index + 1}/{total}: {name}", show_header=True
|
118
|
-
)
|
119
|
-
table.add_column("Metric", style="cyan", no_wrap=True)
|
120
|
-
table.add_column("Value", style="magenta")
|
121
|
-
table.add_row("Rows", str(len(ds)))
|
122
|
-
table.add_row("Columns", ", ".join(ds.column_names))
|
123
|
-
self.console.print(table)
|
124
|
-
|
125
|
-
def _getFilePath(self, dirs: List[str], filename: str) -> str:
|
126
|
-
"""Find a named configuration file.
|
127
|
-
|
128
|
-
Files are checked in the following order:
|
129
|
-
1. Absolute path is always used
|
130
|
-
2. Checked relative to the directories in "dirs"
|
131
|
-
3. Relative to the current directory
|
132
|
-
|
133
|
-
Parameters
|
134
|
-
----------
|
135
|
-
dirs : List[str]
|
136
|
-
Directories in which to search for the file.
|
137
|
-
filename : str
|
138
|
-
The path to the configuration file.
|
139
|
-
|
140
|
-
Returns
|
141
|
-
-------
|
142
|
-
str
|
143
|
-
Selected file path.
|
144
|
-
"""
|
145
|
-
return resolve_path(filename, dirs)
|
146
|
-
|
147
|
-
def _drop_duplicates(self, dataset: Dataset, cols: List[str]) -> Dataset:
|
148
|
-
"""Drop duplicates from the dataset based on the columns provided.
|
149
|
-
|
150
|
-
Parameters
|
151
|
-
----------
|
152
|
-
dataset : Dataset
|
153
|
-
The input dataset.
|
154
|
-
cols : List[str]
|
155
|
-
Columns to consider for duplicate detection.
|
156
|
-
|
157
|
-
Returns
|
158
|
-
-------
|
159
|
-
Dataset
|
160
|
-
Dataset with duplicates removed.
|
161
|
-
"""
|
162
|
-
df = dataset.to_pandas()
|
163
|
-
df = df.drop_duplicates(subset=cols).reset_index(drop=True)
|
164
|
-
return Dataset.from_pandas(df)
|
165
|
-
|
166
|
-
def generate(self, dataset: Dataset) -> Dataset:
|
167
|
-
"""Generate the dataset by running the pipeline steps.
|
168
|
-
|
169
|
-
Parameters
|
170
|
-
----------
|
171
|
-
dataset : Dataset
|
172
|
-
The input dataset to process.
|
173
|
-
|
174
|
-
Returns
|
175
|
-
-------
|
176
|
-
Dataset
|
177
|
-
The processed dataset.
|
178
|
-
|
179
|
-
Raises
|
180
|
-
------
|
181
|
-
ValueError
|
182
|
-
If Flow has not been initialized with blocks.
|
183
|
-
EmptyDatasetError
|
184
|
-
If a block produces an empty dataset.
|
185
|
-
"""
|
186
|
-
if self.chained_blocks is None:
|
187
|
-
raise ValueError(
|
188
|
-
"Flow has not been initialized with blocks. "
|
189
|
-
"Call get_flow_from_file() first. "
|
190
|
-
"Or pass a list of blocks to the Flow constructor."
|
191
|
-
)
|
192
|
-
|
193
|
-
for i, block_prop in enumerate(self.chained_blocks):
|
194
|
-
block_type = block_prop["block_type"]
|
195
|
-
block_config = block_prop["block_config"]
|
196
|
-
drop_columns = block_prop.get("drop_columns", [])
|
197
|
-
gen_kwargs = block_prop.get("gen_kwargs", {})
|
198
|
-
drop_duplicates_cols = block_prop.get("drop_duplicates", False)
|
199
|
-
block = block_type(**block_config)
|
200
|
-
|
201
|
-
name = block_config.get("block_name", f"block_{i}")
|
202
|
-
|
203
|
-
# Logging: always show basic progress unless in quiet mode
|
204
|
-
if self.log_level in ["normal", "verbose", "debug"]:
|
205
|
-
logger.info(
|
206
|
-
f"🔄 Running block {i + 1}/{len(self.chained_blocks)}: {name}"
|
207
|
-
)
|
208
|
-
|
209
|
-
# Log dataset shape before block (verbose/debug)
|
210
|
-
self._log_block_info(i, len(self.chained_blocks), name, dataset, "Input")
|
211
|
-
|
212
|
-
if self.log_level == "debug":
|
213
|
-
logger.debug(f"Input dataset (truncated): {dataset}")
|
214
|
-
|
215
|
-
dataset = block.generate(dataset, **gen_kwargs)
|
216
|
-
|
217
|
-
if len(dataset) == 0:
|
218
|
-
raise EmptyDatasetError(
|
219
|
-
f"Pipeline stopped: "
|
220
|
-
f"Empty dataset after running block: "
|
221
|
-
f"{block_config['block_name']}"
|
222
|
-
)
|
223
|
-
|
224
|
-
drop_columns_in_ds = [e for e in drop_columns if e in dataset.column_names]
|
225
|
-
if drop_columns:
|
226
|
-
dataset = dataset.remove_columns(drop_columns_in_ds)
|
227
|
-
|
228
|
-
if drop_duplicates_cols:
|
229
|
-
dataset = self._drop_duplicates(dataset, cols=drop_duplicates_cols)
|
230
|
-
|
231
|
-
# Log dataset shape after block (verbose/debug)
|
232
|
-
self._log_block_info(i, len(self.chained_blocks), name, dataset, "Output")
|
233
|
-
|
234
|
-
if self.log_level == "debug":
|
235
|
-
logger.debug(f"Output dataset (truncated): {dataset}")
|
236
|
-
|
237
|
-
return dataset
|
238
|
-
|
239
|
-
def validate_config_files(self) -> "ValidationResult":
|
240
|
-
"""
|
241
|
-
Validate all configuration file paths referenced in the flow blocks.
|
242
|
-
|
243
|
-
This method checks that all config files specified via `config_path` or `config_paths`
|
244
|
-
in each block:
|
245
|
-
- Exist on the filesystem
|
246
|
-
- Are readable by the current process
|
247
|
-
- Are valid YAML files (optional format check)
|
248
|
-
|
249
|
-
Returns
|
250
|
-
-------
|
251
|
-
ValidationResult
|
252
|
-
An object indicating whether all config files passed validation, along with a list
|
253
|
-
of error messages for any missing, unreadable, or invalid YAML files.
|
254
|
-
|
255
|
-
Notes
|
256
|
-
-----
|
257
|
-
This method is automatically called at the end of `get_flow_from_file()` to ensure
|
258
|
-
early detection of misconfigured blocks.
|
259
|
-
"""
|
260
|
-
errors = []
|
261
|
-
|
262
|
-
def check_file(path: str, context: str):
|
263
|
-
if not os.path.isfile(path):
|
264
|
-
errors.append(f"[{context}] File does not exist: {path}")
|
265
|
-
else:
|
266
|
-
try:
|
267
|
-
with open(path, "r", encoding="utf-8") as f:
|
268
|
-
config_data = yaml.safe_load(f)
|
269
|
-
_, validation_errors = validate_prompt_config_schema(
|
270
|
-
config_data, path
|
271
|
-
)
|
272
|
-
|
273
|
-
if validation_errors:
|
274
|
-
errors.extend(validation_errors)
|
275
|
-
|
276
|
-
except PermissionError:
|
277
|
-
errors.append(f"[{context}] File is not readable: {path}")
|
278
|
-
except yaml.YAMLError as e:
|
279
|
-
errors.append(f"[{context}] YAML load failed: {path} ({e})")
|
280
|
-
|
281
|
-
for i, block in enumerate(self.chained_blocks or []):
|
282
|
-
block_name = block["block_config"].get("block_name", f"block_{i}")
|
283
|
-
|
284
|
-
config_path = block["block_config"].get("config_path")
|
285
|
-
if config_path:
|
286
|
-
check_file(config_path, f"{block_name}.config_path")
|
287
|
-
|
288
|
-
config_paths = block["block_config"].get("config_paths")
|
289
|
-
if isinstance(config_paths, list):
|
290
|
-
for idx, path in enumerate(config_paths):
|
291
|
-
check_file(path, f"{block_name}.config_paths[{idx}]")
|
292
|
-
elif isinstance(config_paths, dict):
|
293
|
-
for key, path in config_paths.items():
|
294
|
-
check_file(path, f"{block_name}.config_paths['{key}']")
|
295
|
-
|
296
|
-
return ValidationResult(valid=(len(errors) == 0), errors=errors)
|
297
|
-
|
298
|
-
def get_flow_from_file(self, yaml_path: str) -> "Flow":
    """Load and initialize flow configuration from a YAML file.

    Resolves the YAML path, loads the block list, wires LLM blocks to the
    shared client, resolves prompt templates and registry entries, converts
    relative config paths to absolute ones, and finally validates all
    referenced config files.

    Parameters
    ----------
    yaml_path : str
        Path to the YAML configuration file.

    Returns
    -------
    Flow
        Self with initialized chained_blocks.

    Raises
    ------
    FileNotFoundError
        If the YAML file cannot be found.
    KeyError
        If a required block or prompt is not found in the registry.
    ValueError
        If any referenced config file fails validation.
    """
    yaml_path = resolve_path(yaml_path, self.base_path)
    # Relative config paths inside the flow file are resolved against
    # the flow file's own directory first, then the base path.
    yaml_dir = os.path.dirname(yaml_path)

    try:
        with open(yaml_path, "r", encoding="utf-8") as yaml_file:
            flow = yaml.safe_load(yaml_file)
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"File not found: {yaml_path}") from exc

    # update config with class instances
    for block in flow:
        # check if theres an llm block in the flow
        if "LLM" in block["block_type"]:
            block["block_config"]["client"] = self.llm_client
            # model_id and prompt templates
            # try to get a template using the model_id, but if model_prompt_template is provided, use that
            if block["block_config"].get("model_prompt", None) is None:
                # try to find a match in the registry
                matched_prompt = next(
                    (
                        key
                        for key in PromptRegistry.get_registry()
                        if key in block["block_config"]["model_id"]
                    ),
                    None,
                )
                if matched_prompt is not None:
                    block["block_config"]["model_prompt"] = matched_prompt
                else:
                    raise KeyError(
                        f"Prompt not found in registry: {block['block_config']['model_id']}"
                    )

            if self.num_samples_to_generate is not None:
                block["num_samples"] = self.num_samples_to_generate

        # update block type to llm class instance
        try:
            block["block_type"] = self.registered_blocks[block["block_type"]]
        except KeyError as exc:
            raise KeyError(
                f"Block not found in registry: {block['block_type']}"
            ) from exc

        # update config path to absolute path
        if "config_path" in block["block_config"]:
            block["block_config"]["config_path"] = self._getFilePath(
                [yaml_dir, self.base_path], block["block_config"]["config_path"]
            )

        # update config paths to absolute paths - this might be a list or a dict
        if "config_paths" in block["block_config"]:
            if isinstance(block["block_config"]["config_paths"], dict):
                for key, path in block["block_config"]["config_paths"].items():
                    block["block_config"]["config_paths"][key] = self._getFilePath(
                        [yaml_dir, self.base_path], path
                    )

            elif isinstance(block["block_config"]["config_paths"], list):
                for i, path in enumerate(block["block_config"]["config_paths"]):
                    block["block_config"]["config_paths"][i] = self._getFilePath(
                        [yaml_dir, self.base_path], path
                    )

        # Map operator / dtype names from the YAML onto their callables.
        if "operation" in block["block_config"]:
            block["block_config"]["operation"] = OPERATOR_MAP[
                block["block_config"]["operation"]
            ]

        if "convert_dtype" in block["block_config"]:
            block["block_config"]["convert_dtype"] = CONVERT_DTYPE_MAP[
                block["block_config"]["convert_dtype"]
            ]

    # Store the chained blocks and return self
    self.chained_blocks = flow

    # Validate config files
    result = self.validate_config_files()
    if not result.valid:
        # BUG FIX: the prefix was previously used as the *separator* of
        # str.join, which dropped it entirely for a single error and
        # interleaved it between errors otherwise. Prefix once, join with
        # newlines.
        raise ValueError("Invalid config files:\n\n" + "\n".join(result.errors))

    return self
|
401
|
-
|
402
|
-
def validate_flow(self, dataset: Dataset) -> "ValidationResult":
    """
    Validate that all required dataset columns are present before executing the flow.

    This includes:
    - Columns referenced in Jinja templates for LLM blocks
    - Columns required by specific utility blocks (e.g. filter_column, choice_col, etc.)

    Parameters
    ----------
    dataset : Dataset
        The input dataset to validate against.

    Returns
    -------
    ValidationResult
        Whether the dataset has all required columns, and which ones are missing.
    """
    problems = []
    available = set(dataset.column_names)

    for idx, block in enumerate(self.chained_blocks or []):
        config = block["block_config"]
        block_type = block["block_type"]
        label = config.get("block_name", f"block_{idx}")

        # Resolve a printable class name whether block_type is a class
        # object or an already-constructed instance.
        if isinstance(block_type, type):
            type_name = block_type.__name__
        else:
            type_name = block_type.__class__.__name__
        logger.info(f"Validating block: {label} ({type_name})")

        # LLM blocks: every undeclared Jinja variable in the prompt
        # template must be backed by a dataset column.
        if "LLM" in type_name:
            template_path = config.get("config_path")
            if template_path and os.path.isfile(template_path):
                with open(template_path, "r", encoding="utf-8") as handle:
                    template_text = handle.read()
                parsed = Environment().parse(template_text)
                problems.extend(
                    f"[{label}] Missing column for prompt var: '{var}'"
                    for var in meta.find_undeclared_variables(parsed)
                    if var not in available
                )

        type_repr = str(block_type)

        # FilterByValueBlock: its filter column must exist.
        if "FilterByValueBlock" in type_repr:
            filter_col = config.get("filter_column")
            if filter_col and filter_col not in available:
                problems.append(f"[{label}] Missing filter_column: '{filter_col}'")

        # SelectorBlock: the choice column and every column referenced
        # by choice_map must exist.
        if "SelectorBlock" in type_repr:
            choice_col = config.get("choice_col")
            if choice_col and choice_col not in available:
                problems.append(f"[{label}] Missing choice_col: '{choice_col}'")

            problems.extend(
                f"[{label}] choice_map references missing column: '{col}'"
                for col in config.get("choice_map", {}).values()
                if col not in available
            )

        # CombineColumnsBlock: every listed source column must exist.
        if "CombineColumnsBlock" in type_repr:
            problems.extend(
                f"[{label}] CombineColumnsBlock requires column: '{col}'"
                for col in config.get("columns", [])
                if col not in available
            )

    return ValidationResult(valid=(len(problems) == 0), errors=problems)
|