sdg-hub 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sdg_hub/_version.py +2 -2
- sdg_hub/core/blocks/llm/client_manager.py +37 -25
- sdg_hub/core/blocks/llm/llm_chat_block.py +12 -9
- sdg_hub/core/blocks/llm/text_parser_block.py +88 -21
- sdg_hub/core/blocks/transform/__init__.py +2 -0
- sdg_hub/core/blocks/transform/json_structure_block.py +142 -0
- sdg_hub/core/flow/base.py +199 -56
- sdg_hub/core/utils/datautils.py +27 -2
- sdg_hub/core/utils/flow_metrics.py +261 -0
- sdg_hub/core/utils/logger_config.py +50 -9
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml +11 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml +159 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml +65 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/flow.yaml +161 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml +15 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml +21 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_question_list.yaml +44 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py +0 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml +104 -0
- sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/key_facts_summary.yaml +61 -0
- sdg_hub/flows/text_analysis/__init__.py +2 -0
- sdg_hub/flows/text_analysis/structured_insights/__init__.py +6 -0
- sdg_hub/flows/text_analysis/structured_insights/analyze_sentiment.yaml +27 -0
- sdg_hub/flows/text_analysis/structured_insights/extract_entities.yaml +38 -0
- sdg_hub/flows/text_analysis/structured_insights/extract_keywords.yaml +21 -0
- sdg_hub/flows/text_analysis/structured_insights/flow.yaml +153 -0
- sdg_hub/flows/text_analysis/structured_insights/summarize.yaml +21 -0
- {sdg_hub-0.2.2.dist-info → sdg_hub-0.3.0.dist-info}/METADATA +3 -1
- {sdg_hub-0.2.2.dist-info → sdg_hub-0.3.0.dist-info}/RECORD +35 -13
- {sdg_hub-0.2.2.dist-info → sdg_hub-0.3.0.dist-info}/WHEEL +0 -0
- {sdg_hub-0.2.2.dist-info → sdg_hub-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {sdg_hub-0.2.2.dist-info → sdg_hub-0.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,161 @@
|
|
1
|
+
metadata:
|
2
|
+
name: Extractive Summary Knowledge Tuning Dataset Generation Flow
|
3
|
+
description: Generate extractive summary from the input document. Each document is first converted into list of knowledge segments for creating extractive summary and then annotated with context, relationship and relevance. This is then converted
|
4
|
+
into Question-Answer pairs.
|
5
|
+
version: 2.0.0
|
6
|
+
author: SDG Hub Contributors
|
7
|
+
recommended_models:
|
8
|
+
default: openai/gpt-oss-120b
|
9
|
+
compatible:
|
10
|
+
- meta-llama/Llama-3.3-70B-Instruct
|
11
|
+
- microsoft/phi-4
|
12
|
+
- mistralai/Mixtral-8x7B-Instruct-v0.1
|
13
|
+
experimental: []
|
14
|
+
tags:
|
15
|
+
- knowledge-tuning
|
16
|
+
- document-internalization
|
17
|
+
- question-generation
|
18
|
+
- knowledge-extractive-summary
|
19
|
+
- qa-pairs
|
20
|
+
- extractive-summaries
|
21
|
+
license: Apache-2.0
|
22
|
+
min_sdg_hub_version: 0.2.0
|
23
|
+
dataset_requirements:
|
24
|
+
required_columns:
|
25
|
+
- document
|
26
|
+
- document_outline
|
27
|
+
- domain
|
28
|
+
- icl_document
|
29
|
+
- icl_query_1
|
30
|
+
- icl_query_2
|
31
|
+
- icl_query_3
|
32
|
+
description: 'Input dataset should contain documents with text content and domain classification. Each document should be substantial enough for meaningful question generation (minimum 100 words recommended). The flow generates three types
|
33
|
+
of summaries: detailed (n=20), extractive (n=10), and key facts (n=50), each producing corresponding QA pairs designed to help LLMs internalize document knowledge for knowledge tuning.'
|
34
|
+
output_columns:
|
35
|
+
- summary
|
36
|
+
- question
|
37
|
+
- response
|
38
|
+
- raw_document
|
39
|
+
- faithfulness_explanation
|
40
|
+
- faithfulness_judgment
|
41
|
+
id: epic-jade-656
|
42
|
+
blocks:
|
43
|
+
- block_type: DuplicateColumnsBlock
|
44
|
+
block_config:
|
45
|
+
block_name: duplicate_document_col
|
46
|
+
input_cols:
|
47
|
+
document: base_document
|
48
|
+
- block_type: PromptBuilderBlock
|
49
|
+
block_config:
|
50
|
+
block_name: extractive_summary_prompt
|
51
|
+
input_cols:
|
52
|
+
- document
|
53
|
+
- document_outline
|
54
|
+
output_cols: extractive_summary_prompt
|
55
|
+
prompt_config_path: extractive_summary.yaml
|
56
|
+
format_as_messages: true
|
57
|
+
- block_type: LLMChatBlock
|
58
|
+
block_config:
|
59
|
+
block_name: gen_extractive_summary
|
60
|
+
input_cols: extractive_summary_prompt
|
61
|
+
output_cols: raw_summary
|
62
|
+
max_tokens: 4096
|
63
|
+
temperature: 0.7
|
64
|
+
n: 50
|
65
|
+
async_mode: true
|
66
|
+
- block_type: TextParserBlock
|
67
|
+
block_config:
|
68
|
+
block_name: parse_extractive_summary
|
69
|
+
input_cols: raw_summary
|
70
|
+
output_cols: summary
|
71
|
+
start_tags:
|
72
|
+
- ''
|
73
|
+
end_tags:
|
74
|
+
- ''
|
75
|
+
- block_type: RenameColumnsBlock
|
76
|
+
block_config:
|
77
|
+
block_name: rename_to_document_column
|
78
|
+
input_cols:
|
79
|
+
document: raw_document
|
80
|
+
summary: document
|
81
|
+
- block_type: PromptBuilderBlock
|
82
|
+
block_config:
|
83
|
+
block_name: question_generation_prompt
|
84
|
+
input_cols:
|
85
|
+
- domain
|
86
|
+
- document
|
87
|
+
- document_outline
|
88
|
+
- icl_document
|
89
|
+
- icl_query_1
|
90
|
+
- icl_query_2
|
91
|
+
- icl_query_3
|
92
|
+
output_cols: question_generation_prompt
|
93
|
+
prompt_config_path: ../generate_question_list.yaml
|
94
|
+
format_as_messages: true
|
95
|
+
- block_type: LLMChatBlock
|
96
|
+
block_config:
|
97
|
+
block_name: question_generation
|
98
|
+
input_cols: question_generation_prompt
|
99
|
+
output_cols: question_list
|
100
|
+
max_tokens: 256
|
101
|
+
temperature: 0.7
|
102
|
+
n: 1
|
103
|
+
async_mode: true
|
104
|
+
- block_type: TextParserBlock
|
105
|
+
block_config:
|
106
|
+
block_name: parse_question_list
|
107
|
+
input_cols: question_list
|
108
|
+
output_cols: question
|
109
|
+
start_tags:
|
110
|
+
- '[QUESTION]'
|
111
|
+
end_tags:
|
112
|
+
- '[END]'
|
113
|
+
- block_type: PromptBuilderBlock
|
114
|
+
block_config:
|
115
|
+
block_name: answer_generation_prompt
|
116
|
+
input_cols:
|
117
|
+
- question
|
118
|
+
- document
|
119
|
+
- document_outline
|
120
|
+
output_cols: answer_generation_prompt
|
121
|
+
prompt_config_path: ../generate_answers.yaml
|
122
|
+
format_as_messages: true
|
123
|
+
- block_type: LLMChatBlock
|
124
|
+
block_config:
|
125
|
+
block_name: answer_generation
|
126
|
+
input_cols: answer_generation_prompt
|
127
|
+
output_cols: response_dict
|
128
|
+
max_tokens: 4096
|
129
|
+
temperature: 0.7
|
130
|
+
n: 1
|
131
|
+
async_mode: true
|
132
|
+
- block_type: TextParserBlock
|
133
|
+
block_config:
|
134
|
+
block_name: parse_response_dict
|
135
|
+
input_cols: response_dict
|
136
|
+
output_cols: response
|
137
|
+
start_tags:
|
138
|
+
- ''
|
139
|
+
end_tags:
|
140
|
+
- ''
|
141
|
+
save_reasoning_content: true
|
142
|
+
- block_type: EvaluateFaithfulnessBlock
|
143
|
+
block_config:
|
144
|
+
block_name: eval_faithfulness
|
145
|
+
input_cols:
|
146
|
+
- document
|
147
|
+
- response
|
148
|
+
output_cols:
|
149
|
+
- faithfulness_explanation
|
150
|
+
- faithfulness_judgment
|
151
|
+
prompt_config_path: ../../multi_summary_qa/instructlab/evaluate_faithfulness.yaml
|
152
|
+
filter_value: 'YES'
|
153
|
+
operation: eq
|
154
|
+
async_mode: true
|
155
|
+
format_as_messages: true
|
156
|
+
start_tags:
|
157
|
+
- '[Start of Explanation]'
|
158
|
+
- '[Start of Answer]'
|
159
|
+
end_tags:
|
160
|
+
- '[End of Explanation]'
|
161
|
+
- '[End of Answer]'
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_answers.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
- role: system
|
2
|
+
content: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
|
3
|
+
|
4
|
+
- role: user
|
5
|
+
content: |
|
6
|
+
Answer the question based on the provided document.
|
7
|
+
|
8
|
+
Here is the document:
|
9
|
+
|
10
|
+
Document:
|
11
|
+
{{document_outline}}
|
12
|
+
{{document}}
|
13
|
+
|
14
|
+
Question:
|
15
|
+
{{question}}
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/generate_multiple_qa.yaml
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
- role: system
|
2
|
+
content: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
|
3
|
+
|
4
|
+
- role: user
|
5
|
+
content: |
|
6
|
+
Given the key fact below, taken from a document, generate 5 Question and Answer pairs based on the key fact.
|
7
|
+
Introduce variation in the question and key fact.
|
8
|
+
Make sure to ground the question and answer in the provided key fact.
|
9
|
+
|
10
|
+
Strictly follow this format for each question and answer pair you generate while responding:
|
11
|
+
[QUESTION]
|
12
|
+
<Insert question here>
|
13
|
+
[END]
|
14
|
+
[ANSWER]
|
15
|
+
<Insert answer here>
|
16
|
+
[END]
|
17
|
+
|
18
|
+
Now, here is the key fact:
|
19
|
+
[Key Fact]
|
20
|
+
{{document_outline}}
|
21
|
+
{{key_fact}}
|
@@ -0,0 +1,44 @@
|
|
1
|
+
- role: system
|
2
|
+
content: You are a very knowledgeable AI Assistant that will faithfully assist the user with their task.
|
3
|
+
|
4
|
+
- role: user
|
5
|
+
content: |
|
6
|
+
Develop a series of educational questions from a chapter in a {{domain}} textbook.
|
7
|
+
|
8
|
+
The questions should:
|
9
|
+
* Self-contained – understandable without needing to reference tables, figures, or specific text sections.
|
10
|
+
* Focus on the provided example and follow the format and style of the provided examples.
|
11
|
+
* Relevant to the subject – based on the textbook’s domain (e.g., legal, scientific, etc.).
|
12
|
+
* Independently answerable – avoid direct references to theorems, figures, or text numbers.
|
13
|
+
* Varied in difficulty - Make the difficulty the same as the provided examples.
|
14
|
+
* Use same format as the provided examples.
|
15
|
+
|
16
|
+
Strictly follow this format for each question you generate while responding
|
17
|
+
|
18
|
+
[QUESTION]
|
19
|
+
<Insert question here>
|
20
|
+
[END]
|
21
|
+
|
22
|
+
Each question and answer pair should stand alone as a mini-lesson, encapsulating a key concept or idea from the chapter in a way that is accessible and informative without requiring the reader to refer back to the textbook.
|
23
|
+
|
24
|
+
Here are some examples of questions:
|
25
|
+
|
26
|
+
[Document]
|
27
|
+
{{icl_document}}
|
28
|
+
|
29
|
+
[QUESTION]
|
30
|
+
{{icl_query_1}}
|
31
|
+
[END]
|
32
|
+
|
33
|
+
[QUESTION]
|
34
|
+
{{icl_query_2}}
|
35
|
+
[END]
|
36
|
+
|
37
|
+
[QUESTION]
|
38
|
+
{{icl_query_3}}
|
39
|
+
[END]
|
40
|
+
|
41
|
+
Now, here is the document:
|
42
|
+
[DOCUMENT]
|
43
|
+
{{document_outline}}
|
44
|
+
{{document}}
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/__init__.py
ADDED
File without changes
|
sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/key_facts/flow.yaml
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
metadata:
|
2
|
+
name: Key Facts Knowledge Tuning Dataset Generation Flow
|
3
|
+
description: Generating list of atomic facts from a document and converting each atomic fact into a QA pair. This flow will generate 5 QA pairs for each atomic fact.
|
4
|
+
version: 2.0.0
|
5
|
+
author: SDG Hub Contributors
|
6
|
+
recommended_models:
|
7
|
+
default: openai/gpt-oss-120b
|
8
|
+
compatible:
|
9
|
+
- meta-llama/Llama-3.3-70B-Instruct
|
10
|
+
- microsoft/phi-4
|
11
|
+
- mistralai/Mixtral-8x7B-Instruct-v0.1
|
12
|
+
experimental: []
|
13
|
+
tags:
|
14
|
+
- knowledge-tuning
|
15
|
+
- document-internalization
|
16
|
+
- question-generation
|
17
|
+
- qa-pairs
|
18
|
+
- key-facts
|
19
|
+
license: Apache-2.0
|
20
|
+
min_sdg_hub_version: 0.2.0
|
21
|
+
dataset_requirements:
|
22
|
+
required_columns:
|
23
|
+
- document
|
24
|
+
- document_outline
|
25
|
+
- domain
|
26
|
+
description: 'Input dataset should contain documents with text content and domain classification. Each document should be substantial enough for meaningful question generation (around maximum of 8000 tokens). The flow generates 5 QA pairs for each atomic fact.'
|
27
|
+
output_columns:
|
28
|
+
- key_fact
|
29
|
+
- question
|
30
|
+
- response
|
31
|
+
- raw_key_fact_qa
|
32
|
+
id: heavy-heart-77
|
33
|
+
blocks:
|
34
|
+
- block_type: PromptBuilderBlock
|
35
|
+
block_config:
|
36
|
+
block_name: atomic_facts_prompt
|
37
|
+
input_cols:
|
38
|
+
- document
|
39
|
+
- document_outline
|
40
|
+
- domain
|
41
|
+
output_cols: atomic_facts_prompt
|
42
|
+
prompt_config_path: key_facts_summary.yaml
|
43
|
+
format_as_messages: true
|
44
|
+
- block_type: LLMChatBlock
|
45
|
+
block_config:
|
46
|
+
block_name: gen_atomic_facts
|
47
|
+
input_cols: atomic_facts_prompt
|
48
|
+
output_cols: raw_summary
|
49
|
+
max_tokens: 4096
|
50
|
+
temperature: 0.7
|
51
|
+
n: 1
|
52
|
+
async_mode: true
|
53
|
+
- block_type: TextParserBlock
|
54
|
+
block_config:
|
55
|
+
block_name: parse_atomic_facts
|
56
|
+
input_cols: raw_summary
|
57
|
+
output_cols: atomic_facts
|
58
|
+
start_tags:
|
59
|
+
- '### Key Facts With Context'
|
60
|
+
end_tags:
|
61
|
+
- ''
|
62
|
+
- block_type: TextParserBlock
|
63
|
+
block_config:
|
64
|
+
block_name: parse_atomic_facts_to_individual_facts
|
65
|
+
input_cols: atomic_facts
|
66
|
+
output_cols: key_fact
|
67
|
+
parsing_pattern: '(?:^|\n)\s*\d+\.\s+(.*?)(?=\n\s*\d+\.\s+|\Z)'
|
68
|
+
- block_type: RenameColumnsBlock
|
69
|
+
block_config:
|
70
|
+
block_name: rename_to_document_column
|
71
|
+
input_cols:
|
72
|
+
document: raw_document
|
73
|
+
atomic_facts: document
|
74
|
+
- block_type: PromptBuilderBlock
|
75
|
+
block_config:
|
76
|
+
block_name: key_fact_qa
|
77
|
+
input_cols:
|
78
|
+
- key_fact
|
79
|
+
- document_outline
|
80
|
+
output_cols: key_fact_qa
|
81
|
+
prompt_config_path: ../generate_multiple_qa.yaml
|
82
|
+
format_as_messages: true
|
83
|
+
- block_type: LLMChatBlock
|
84
|
+
block_config:
|
85
|
+
block_name: generate_key_fact_qa
|
86
|
+
input_cols: key_fact_qa
|
87
|
+
output_cols: raw_key_fact_qa
|
88
|
+
max_tokens: 4096
|
89
|
+
temperature: 0.7
|
90
|
+
n: 1
|
91
|
+
async_mode: true
|
92
|
+
- block_type: TextParserBlock
|
93
|
+
block_config:
|
94
|
+
block_name: parse_key_fact_qa
|
95
|
+
input_cols: raw_key_fact_qa
|
96
|
+
output_cols:
|
97
|
+
- question
|
98
|
+
- response
|
99
|
+
start_tags:
|
100
|
+
- '[QUESTION]'
|
101
|
+
- '[ANSWER]'
|
102
|
+
end_tags:
|
103
|
+
- '[END]'
|
104
|
+
- '[END]'
|
@@ -0,0 +1,61 @@
|
|
1
|
+
- role: system
|
2
|
+
content: You are an expert at summarizing key facts from text.
|
3
|
+
|
4
|
+
- role: user
|
5
|
+
content: |
|
6
|
+
Please break down the following snippet from an article about {{domain}} into atomic facts.
|
7
|
+
Follow these principles to summarize the key facts:
|
8
|
+
1. Identify atomic facts/key facts from the text.
|
9
|
+
2. Break down compound sentences into atomic facts.
|
10
|
+
3. First list the key facts.
|
11
|
+
4. Then, provide each fact with enough context from the passage so that a reader can clearly understand how the fact connects to the original text.
|
12
|
+
5. Follow the format of the examples below.
|
13
|
+
|
14
|
+
To help you understand the task, here is an example:
|
15
|
+
|
16
|
+
### Passage
|
17
|
+
Remote work has grown by over 150% since 2020 due to the pandemic. Companies found that productivity remained stable, while employee satisfaction increased. However, challenges like communication gaps and team cohesion issues emerged. Firms are now adopting hybrid models to balance flexibility with collaboration.
|
18
|
+
|
19
|
+
### Key Facts
|
20
|
+
1. Remote work has grown by over 150% since 2020.
|
21
|
+
2. The pandemic was the driving force behind this growth.
|
22
|
+
3. Companies reported that productivity remained stable during remote work.
|
23
|
+
4. Employee satisfaction increased during the remote work period.
|
24
|
+
5. Remote work created communication challenges.
|
25
|
+
6. Remote work weakened team cohesion.
|
26
|
+
7. Companies are adopting hybrid models.
|
27
|
+
8. Hybrid models aim to balance flexibility with collaboration.
|
28
|
+
|
29
|
+
|
30
|
+
### Key Facts With Context
|
31
|
+
1. **Remote work has grown by over 150% since 2020.**
|
32
|
+
→ This fact quantifies the dramatic rise in remote work, establishing a clear before-and-after comparison post-pandemic.
|
33
|
+
|
34
|
+
2. **The pandemic was the driving force behind this growth.**
|
35
|
+
→ It situates the shift in a global crisis context, emphasizing the reactive nature of the workplace transformation.
|
36
|
+
|
37
|
+
3. **Companies reported that productivity remained stable during remote work.**
|
38
|
+
→ Despite initial concerns, businesses observed that remote work did not negatively impact output, reinforcing its viability.
|
39
|
+
|
40
|
+
4. **Employee satisfaction increased during the remote work period.**
|
41
|
+
→ Morale and well-being improved under remote arrangements, adding a human-centric benefit to the operational model.
|
42
|
+
|
43
|
+
5. **Remote work created communication challenges.**
|
44
|
+
→ Acknowledges that the new model introduced friction in interpersonal and organizational dialogue.
|
45
|
+
|
46
|
+
6. **Remote work weakened team cohesion.**
|
47
|
+
→ Beyond communication, the shift led to reduced team bonding and synergy, impacting culture and collaboration.
|
48
|
+
|
49
|
+
7. **Companies are adopting hybrid models.**
|
50
|
+
→ Businesses are no longer sticking with full remote or full office—they are evolving toward a blended solution.
|
51
|
+
|
52
|
+
8. **Hybrid models aim to balance flexibility with collaboration.**
|
53
|
+
→ The rationale behind hybrid models is to retain the positives (flexibility, satisfaction) while mitigating the negatives (isolation, miscommunication).
|
54
|
+
### End
|
55
|
+
|
56
|
+
Now it's your turn: break down the following snippet from an article about {{domain}} into atomic facts, following a similar style as the above examples.
|
57
|
+
### Passage
|
58
|
+
{{document_outline}}
|
59
|
+
{{document}}
|
60
|
+
|
61
|
+
### Key Facts
|
@@ -0,0 +1,6 @@
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
2
|
+
"""Structured Text Insights Extraction Flow.
|
3
|
+
|
4
|
+
This module provides a comprehensive flow for extracting structured insights from text,
|
5
|
+
including summary, keywords, named entities, and sentiment analysis, combined into a JSON output.
|
6
|
+
"""
|
@@ -0,0 +1,27 @@
|
|
1
|
+
- role: system
|
2
|
+
content: You are an AI assistant expert at analyzing the emotional tone and sentiment of text content.
|
3
|
+
|
4
|
+
- role: user
|
5
|
+
content: |
|
6
|
+
Analyze the overall sentiment and emotional tone of the following text. Consider:
|
7
|
+
|
8
|
+
1. **Emotional tone**: Is the text positive, negative, or neutral?
|
9
|
+
2. **Intensity**: How strong is the sentiment expressed?
|
10
|
+
3. **Context**: Consider the subject matter and how it's presented
|
11
|
+
4. **Balance**: If there are mixed sentiments, which one dominates?
|
12
|
+
|
13
|
+
Sentiment categories:
|
14
|
+
- **positive**: Optimistic, encouraging, favorable, upbeat content
|
15
|
+
- **negative**: Critical, pessimistic, unfavorable, concerning content
|
16
|
+
- **neutral**: Factual, balanced, objective content without strong emotional tone
|
17
|
+
|
18
|
+
Text to analyze:
|
19
|
+
{{text}}
|
20
|
+
|
21
|
+
Provide your response in the following format:
|
22
|
+
[SENTIMENT]
|
23
|
+
positive
|
24
|
+
[/SENTIMENT]
|
25
|
+
|
26
|
+
Where the sentiment value is one of: positive, negative, or neutral
|
27
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
- role: system
|
2
|
+
content: You are an AI assistant expert at identifying and extracting named entities from text content. You must return valid JSON format.
|
3
|
+
|
4
|
+
- role: user
|
5
|
+
content: |
|
6
|
+
Extract all important named entities from the following text and organize them by category:
|
7
|
+
|
8
|
+
Text to analyze:
|
9
|
+
{{text}}
|
10
|
+
|
11
|
+
Identify and categorize entities into:
|
12
|
+
- **people**: Names of individuals, titles, roles
|
13
|
+
- **organizations**: Companies, institutions, agencies, groups
|
14
|
+
- **locations**: Cities, countries, regions, landmarks, addresses
|
15
|
+
|
16
|
+
Rules:
|
17
|
+
- Only include entities explicitly mentioned in the text
|
18
|
+
- Use exact names as they appear
|
19
|
+
- Focus on the most important entities (3-8 per category max)
|
20
|
+
|
21
|
+
Provide your response in exactly this format:
|
22
|
+
[ENTITIES]
|
23
|
+
{
|
24
|
+
"people": ["Person 1", "Person 2"],
|
25
|
+
"organizations": ["Org 1", "Org 2"],
|
26
|
+
"locations": ["Location 1"]
|
27
|
+
}
|
28
|
+
[/ENTITIES]
|
29
|
+
|
30
|
+
If no entities are found for a category, use an empty list: []
|
31
|
+
If no entities are found at all, respond with:
|
32
|
+
[ENTITIES]
|
33
|
+
{
|
34
|
+
"people": [],
|
35
|
+
"organizations": [],
|
36
|
+
"locations": []
|
37
|
+
}
|
38
|
+
[/ENTITIES]
|
@@ -0,0 +1,21 @@
|
|
1
|
+
- role: system
|
2
|
+
content: You are an AI assistant expert at identifying the most important keywords and phrases from text content.
|
3
|
+
|
4
|
+
- role: user
|
5
|
+
content: |
|
6
|
+
Extract exactly 10 of the most important keywords or key phrases from the following text. These should be:
|
7
|
+
|
8
|
+
1. The most relevant and representative terms
|
9
|
+
2. Words or short phrases (1-3 words) that capture the main topics
|
10
|
+
3. Terms that someone would use to search for or categorize this content
|
11
|
+
4. A mix of specific terms and broader concepts when appropriate
|
12
|
+
5. Avoid common stop words unless they're part of important phrases
|
13
|
+
|
14
|
+
Text to analyze:
|
15
|
+
{{text}}
|
16
|
+
|
17
|
+
Provide your response in the following format:
|
18
|
+
[KEYWORDS]
|
19
|
+
keyword1, keyword2, keyword3, keyword4, keyword5, keyword6, keyword7, keyword8, keyword9, keyword10
|
20
|
+
[/KEYWORDS]
|
21
|
+
|
@@ -0,0 +1,153 @@
|
|
1
|
+
metadata:
|
2
|
+
id: green-clay-812
|
3
|
+
name: "Structured Text Insights Extraction Flow"
|
4
|
+
description: >-
|
5
|
+
Multi-step pipeline for extracting structured insights from text including
|
6
|
+
summary, keywords, entities, and sentiment analysis combined into a JSON output
|
7
|
+
version: "1.0.0"
|
8
|
+
author: "SDG Hub Contributors"
|
9
|
+
recommended_models:
|
10
|
+
default: "openai/gpt-oss-120b"
|
11
|
+
compatible:
|
12
|
+
- "meta-llama/Llama-3.3-70B-Instruct"
|
13
|
+
- "microsoft/phi-4"
|
14
|
+
- "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
15
|
+
experimental:
|
16
|
+
- "gpt-4o"
|
17
|
+
tags:
|
18
|
+
- "text-analysis"
|
19
|
+
- "summarization"
|
20
|
+
- "nlp"
|
21
|
+
- "structured-output"
|
22
|
+
- "insights"
|
23
|
+
- "sentiment-analysis"
|
24
|
+
- "entity-extraction"
|
25
|
+
- "keyword-extraction"
|
26
|
+
license: "Apache-2.0"
|
27
|
+
min_sdg_hub_version: "0.2.0"
|
28
|
+
dataset_requirements:
|
29
|
+
required_columns:
|
30
|
+
- "text"
|
31
|
+
description: >-
|
32
|
+
Input dataset should contain text content for analysis. Each text should be
|
33
|
+
substantial enough for meaningful analysis (minimum 50 words recommended).
|
34
|
+
Works well with news articles, blog posts, reviews, and other content.
|
35
|
+
|
36
|
+
blocks:
|
37
|
+
# Extract Summary
|
38
|
+
- block_type: "PromptBuilderBlock"
|
39
|
+
block_config:
|
40
|
+
block_name: "build_summary_prompt"
|
41
|
+
input_cols:
|
42
|
+
- "text"
|
43
|
+
output_cols: "summary_prompt"
|
44
|
+
prompt_config_path: "summarize.yaml"
|
45
|
+
- block_type: "LLMChatBlock"
|
46
|
+
block_config:
|
47
|
+
block_name: "generate_summary"
|
48
|
+
input_cols: "summary_prompt"
|
49
|
+
output_cols: "raw_summary"
|
50
|
+
max_tokens: 1024
|
51
|
+
temperature: 0.3
|
52
|
+
async_mode: true
|
53
|
+
- block_type: "TextParserBlock"
|
54
|
+
block_config:
|
55
|
+
block_name: "parse_summary"
|
56
|
+
input_cols: "raw_summary"
|
57
|
+
output_cols: "summary"
|
58
|
+
start_tags:
|
59
|
+
- "[SUMMARY]"
|
60
|
+
end_tags:
|
61
|
+
- "[/SUMMARY]"
|
62
|
+
|
63
|
+
# Extract Keywords
|
64
|
+
- block_type: "PromptBuilderBlock"
|
65
|
+
block_config:
|
66
|
+
block_name: "build_keywords_prompt"
|
67
|
+
input_cols:
|
68
|
+
- "text"
|
69
|
+
output_cols: "keywords_prompt"
|
70
|
+
prompt_config_path: "extract_keywords.yaml"
|
71
|
+
- block_type: "LLMChatBlock"
|
72
|
+
block_config:
|
73
|
+
block_name: "generate_keywords"
|
74
|
+
input_cols: "keywords_prompt"
|
75
|
+
output_cols: "raw_keywords"
|
76
|
+
max_tokens: 512
|
77
|
+
temperature: 0.3
|
78
|
+
async_mode: true
|
79
|
+
- block_type: "TextParserBlock"
|
80
|
+
block_config:
|
81
|
+
block_name: "parse_keywords"
|
82
|
+
input_cols: "raw_keywords"
|
83
|
+
output_cols: "keywords"
|
84
|
+
start_tags:
|
85
|
+
- "[KEYWORDS]"
|
86
|
+
end_tags:
|
87
|
+
- "[/KEYWORDS]"
|
88
|
+
|
89
|
+
# Extract Entities
|
90
|
+
- block_type: "PromptBuilderBlock"
|
91
|
+
block_config:
|
92
|
+
block_name: "build_entities_prompt"
|
93
|
+
input_cols:
|
94
|
+
- "text"
|
95
|
+
output_cols: "entities_prompt"
|
96
|
+
prompt_config_path: "extract_entities.yaml"
|
97
|
+
- block_type: "LLMChatBlock"
|
98
|
+
block_config:
|
99
|
+
block_name: "generate_entities"
|
100
|
+
input_cols: "entities_prompt"
|
101
|
+
output_cols: "raw_entities"
|
102
|
+
max_tokens: 1024
|
103
|
+
temperature: 0.3
|
104
|
+
async_mode: true
|
105
|
+
- block_type: "TextParserBlock"
|
106
|
+
block_config:
|
107
|
+
block_name: "parse_entities"
|
108
|
+
input_cols: "raw_entities"
|
109
|
+
output_cols: "entities"
|
110
|
+
start_tags:
|
111
|
+
- "[ENTITIES]"
|
112
|
+
end_tags:
|
113
|
+
- "[/ENTITIES]"
|
114
|
+
|
115
|
+
# Extract Sentiment
|
116
|
+
- block_type: "PromptBuilderBlock"
|
117
|
+
block_config:
|
118
|
+
block_name: "build_sentiment_prompt"
|
119
|
+
input_cols:
|
120
|
+
- "text"
|
121
|
+
output_cols: "sentiment_prompt"
|
122
|
+
prompt_config_path: "analyze_sentiment.yaml"
|
123
|
+
- block_type: "LLMChatBlock"
|
124
|
+
block_config:
|
125
|
+
block_name: "generate_sentiment"
|
126
|
+
input_cols: "sentiment_prompt"
|
127
|
+
output_cols: "raw_sentiment"
|
128
|
+
max_tokens: 256
|
129
|
+
temperature: 0.1
|
130
|
+
async_mode: true
|
131
|
+
- block_type: "TextParserBlock"
|
132
|
+
block_config:
|
133
|
+
block_name: "parse_sentiment"
|
134
|
+
input_cols: "raw_sentiment"
|
135
|
+
output_cols: "sentiment"
|
136
|
+
start_tags:
|
137
|
+
- "[SENTIMENT]"
|
138
|
+
end_tags:
|
139
|
+
- "[/SENTIMENT]"
|
140
|
+
|
141
|
+
# Create Structured Insights
|
142
|
+
- block_type: "JSONStructureBlock"
|
143
|
+
block_config:
|
144
|
+
block_name: "create_structured_insights"
|
145
|
+
input_cols:
|
146
|
+
- "summary"
|
147
|
+
- "keywords"
|
148
|
+
- "entities"
|
149
|
+
- "sentiment"
|
150
|
+
output_cols:
|
151
|
+
- "structured_insights"
|
152
|
+
ensure_json_serializable: true
|
153
|
+
|
@@ -0,0 +1,21 @@
|
|
1
|
+
- role: system
|
2
|
+
content: You are an AI assistant expert at creating concise, informative summaries that capture the most important information from text.
|
3
|
+
|
4
|
+
- role: user
|
5
|
+
content: |
|
6
|
+
Create a brief, concise summary of the following text. Your summary should:
|
7
|
+
|
8
|
+
1. Be 2-3 sentences maximum
|
9
|
+
2. Capture the most important information and key points
|
10
|
+
3. Be clear and easy to understand
|
11
|
+
4. Avoid adding any information not present in the original text
|
12
|
+
5. Focus on the main topic, key facts, and significant details
|
13
|
+
|
14
|
+
Text to summarize:
|
15
|
+
{{text}}
|
16
|
+
|
17
|
+
Provide your response in the following format:
|
18
|
+
[SUMMARY]
|
19
|
+
Your summary here
|
20
|
+
[/SUMMARY]
|
21
|
+
|