PyPI - sdg-hub - Versions diffs - 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

sdg-hub 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

sdg_hub/core/utils/flow_metrics.py ADDED Viewed

@@ -0,0 +1,261 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Flow execution metrics utilities for display and export."""
+# Standard
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Optional
+import json
+import time
+# Third Party
+from datasets import Dataset
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+def aggregate_block_metrics(entries: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Aggregate per-block metrics, coalescing chunked runs.
+    Entries sharing the same ``(block_name, block_type)`` pair are merged into
+    one record: execution times and row counts are summed, column changes are
+    unioned, and the record is marked ``"failed"`` if any contributing entry
+    failed. Fields that are missing or explicitly ``None`` count as zero/empty
+    instead of raising.
+    Parameters
+    ----------
+    entries : list[dict[str, Any]]
+        Raw block metrics entries from flow execution.
+    Returns
+    -------
+    list[dict[str, Any]]
+        Aggregated metrics in first-seen block order, with combined execution
+        times and data changes. ``added_cols`` / ``removed_cols`` are sorted
+        lists; ``error_type`` / ``error`` are kept only on failed blocks.
+    """
+    # .get() may return None for either key part, hence the Optional key type
+    agg: dict[tuple[Optional[str], Optional[str]], dict[str, Any]] = {}
+    for m in entries:
+        key = (m.get("block_name"), m.get("block_type"))
+        if key not in agg:
+            # Build the zeroed record only the first time a block is seen
+            agg[key] = {
+                "block_name": key[0],
+                "block_type": key[1],
+                "execution_time": 0.0,
+                "input_rows": 0,
+                "output_rows": 0,
+                "added_cols": set(),
+                "removed_cols": set(),
+                "status": "success",
+                "error_type": None,
+                "error": None,
+            }
+        a = agg[key]
+        # "or" fallbacks cover keys that are present but None, which would
+        # otherwise make float()/int()/set.update() raise TypeError
+        a["execution_time"] += float(m.get("execution_time") or 0.0)
+        a["input_rows"] += int(m.get("input_rows") or 0)
+        a["output_rows"] += int(m.get("output_rows") or 0)
+        a["added_cols"].update(m.get("added_cols") or ())
+        a["removed_cols"].update(m.get("removed_cols") or ())
+        if m.get("status") == "failed":
+            # One failed chunk marks the whole block failed; keep the most
+            # recent non-empty error details
+            a["status"] = "failed"
+            a["error_type"] = m.get("error_type") or a["error_type"]
+            a["error"] = m.get("error") or a["error"]
+    # normalize
+    result = []
+    for a in agg.values():
+        # Sets are not JSON-serializable; emit deterministic sorted lists
+        a["added_cols"] = sorted(a["added_cols"])
+        a["removed_cols"] = sorted(a["removed_cols"])
+        # drop empty error fields
+        if a["status"] == "success":
+            a.pop("error_type", None)
+            a.pop("error", None)
+        result.append(a)
+    return result
+def display_metrics_summary(
+    block_metrics: list[dict[str, Any]],
+    flow_name: str,
+    final_dataset: Optional[Dataset] = None,
+) -> None:
+    """Display a rich table summarizing block execution metrics.
+    Prints one row per entry in ``block_metrics`` followed by a TOTAL row,
+    wrapped in a panel whose title and border color reflect the overall
+    outcome (Failed / Complete / Partial). Nothing is printed when
+    ``block_metrics`` is empty.
+    Parameters
+    ----------
+    block_metrics : list[dict[str, Any]]
+        Raw block metrics from flow execution. Each entry is read for the keys
+        ``block_name``, ``block_type``, ``execution_time``, ``input_rows``,
+        ``output_rows``, ``added_cols``, ``removed_cols`` and ``status``.
+    flow_name : str
+        Name of the flow for display title.
+    final_dataset : Optional[Dataset], optional
+        Final dataset from flow execution. None if flow failed.
+    """
+    if not block_metrics:
+        return
+    console = Console()
+    # Create the metrics table
+    table = Table(
+        show_header=True,
+        header_style="bold bright_white",
+        title="Flow Execution Summary",
+    )
+    table.add_column("Block Name", style="bright_cyan", width=20)
+    table.add_column("Type", style="bright_green", width=15)
+    table.add_column("Duration", justify="right", style="bright_yellow", width=10)
+    table.add_column("Rows", justify="center", style="bright_blue", width=12)
+    table.add_column("Columns", justify="center", style="bright_magenta", width=15)
+    table.add_column("Status", justify="center", width=10)
+    total_time = 0.0
+    successful_blocks = 0
+    for metrics in block_metrics:
+        # Format duration
+        duration = f"{metrics['execution_time']:.2f}s"
+        total_time += metrics["execution_time"]
+        succeeded = metrics["status"] == "success"
+        # Format row changes and status marker together, since both depend
+        # only on whether the block succeeded
+        if succeeded:
+            row_change = f"{metrics['input_rows']:,} → {metrics['output_rows']:,}"
+            status = "[green]✓[/green]"
+            successful_blocks += 1
+        else:
+            row_change = f"{metrics['input_rows']:,} → ❌"
+            status = "[red]✗[/red]"
+        # Format column changes
+        added = len(metrics["added_cols"])
+        removed = len(metrics["removed_cols"])
+        if added > 0 and removed > 0:
+            col_change = f"+{added}/-{removed}"
+        elif added > 0:
+            col_change = f"+{added}"
+        elif removed > 0:
+            col_change = f"-{removed}"
+        else:
+            col_change = "—"
+        table.add_row(
+            metrics["block_name"],
+            metrics["block_type"],
+            duration,
+            row_change,
+            col_change,
+            status,
+        )
+    # Add summary row
+    table.add_section()
+    # Compare against None explicitly: a dataset with zero rows is presumably
+    # falsy (it defines len()), which would misreport its column count as 0
+    # and disagree with the "is None" check used for the panel title below
+    has_dataset = final_dataset is not None
+    final_row_count = len(final_dataset) if has_dataset else 0
+    final_col_count = len(final_dataset.column_names) if has_dataset else 0
+    table.add_row(
+        "[bold]TOTAL[/bold]",
+        f"[bold]{len(block_metrics)} blocks[/bold]",
+        f"[bold]{total_time:.2f}s[/bold]",
+        f"[bold]{final_row_count:,} final[/bold]",
+        f"[bold]{final_col_count} final[/bold]",
+        f"[bold][green]{successful_blocks}/{len(block_metrics)}[/green][/bold]",
+    )
+    # Display the table with panel
+    console.print()
+    # Determine panel title and border color based on execution status
+    failed_blocks = len(block_metrics) - successful_blocks
+    if not has_dataset:
+        # Flow failed completely
+        title = (
+            f"[bold bright_white]{flow_name}[/bold bright_white] - [red]Failed[/red]"
+        )
+        border_style = "bright_red"
+    elif failed_blocks == 0:
+        # All blocks succeeded
+        title = f"[bold bright_white]{flow_name}[/bold bright_white] - [green]Complete[/green]"
+        border_style = "bright_green"
+    else:
+        # Some blocks failed but flow completed
+        title = f"[bold bright_white]{flow_name}[/bold bright_white] - [yellow]Partial[/yellow]"
+        border_style = "bright_yellow"
+    console.print(
+        Panel(
+            table,
+            title=title,
+            border_style=border_style,
+        )
+    )
+    console.print()
+def save_metrics_to_json(
+    block_metrics: list[dict[str, Any]],
+    flow_name: str,
+    flow_version: str,
+    execution_successful: bool,
+    run_start_time: float,
+    log_dir: str,
+    timestamp: Optional[str] = None,
+    flow_name_normalized: Optional[str] = None,
+    logger=None,
+) -> None:
+    """Write an aggregated metrics report for one flow run to a JSON file.
+    The report is saved as ``<flow_name_normalized>_<timestamp>_metrics.json``
+    under ``log_dir``, which is created if needed. Any exception raised while
+    building or writing the report is caught and, when a logger is given,
+    reported as a warning, so metrics problems never abort the flow.
+    Parameters
+    ----------
+    block_metrics : list[dict[str, Any]]
+        Raw block metrics from flow execution.
+    flow_name : str
+        Human-readable flow name.
+    flow_version : str
+        Flow version string.
+    execution_successful : bool
+        Whether the flow execution completed successfully.
+    run_start_time : float
+        Start time from time.perf_counter() for wall time calculation.
+    log_dir : str
+        Directory to save metrics JSON file.
+    timestamp : Optional[str], optional
+        Timestamp string for filename. Defaults to the current local time
+        formatted as ``%Y%m%d_%H%M%S``.
+    flow_name_normalized : Optional[str], optional
+        Normalized flow name for filename. Defaults to ``flow_name`` with
+        spaces replaced by underscores, lowercased.
+    logger : Optional[logging.Logger], optional
+        Logger instance for status messages.
+    """
+    try:
+        # Fill in the filename components the caller did not supply
+        if timestamp is None:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        if flow_name_normalized is None:
+            flow_name_normalized = flow_name.replace(" ", "_").lower()
+        # One record per block, with chunked runs merged together
+        per_block = aggregate_block_metrics(block_metrics)
+        block_time_total = sum(rec["execution_time"] for rec in per_block)
+        # End-to-end elapsed time, measured from the caller's start mark
+        wall_time = time.perf_counter() - run_start_time
+        ok_count = len([rec for rec in per_block if rec["status"] == "success"])
+        failed_count = len([rec for rec in per_block if rec["status"] == "failed"])
+        metrics_data = {
+            "flow_name": flow_name,
+            "flow_version": flow_version,
+            "execution_timestamp": timestamp,
+            "execution_successful": execution_successful,
+            "total_execution_time": block_time_total,
+            "total_wall_time": wall_time,
+            "total_blocks": len(per_block),
+            "successful_blocks": ok_count,
+            "failed_blocks": failed_count,
+            "block_metrics": per_block,
+        }
+        metrics_path = Path(log_dir) / f"{flow_name_normalized}_{timestamp}_metrics.json"
+        metrics_path.parent.mkdir(parents=True, exist_ok=True)
+        with metrics_path.open("w", encoding="utf-8") as out_file:
+            json.dump(metrics_data, out_file, indent=2, sort_keys=True)
+        if logger:
+            logger.info(f"Metrics saved to: {metrics_path}")
+    except Exception as e:
+        # Best-effort: a metrics failure is reported but must not break the flow
+        if logger:
+            logger.warning(f"Failed to save metrics: {e}")

sdg_hub/core/utils/logger_config.py CHANGED Viewed

@@ -7,14 +7,55 @@ import os
 from rich.logging import RichHandler
-def setup_logger(name):
-    # Set up the logger
-    log_level = os.getenv("LOG_LEVEL", "INFO")
-    logging.basicConfig(
-        level=log_level,
-        format="%(message)s",
-        datefmt="[%X]",
-        handlers=[RichHandler()],
-    )
+def setup_logger(name, log_dir=None, log_filename="sdg_hub.log"):
+    """
+    Set up a logger with optional file logging.
+    The level comes from the ``LOG_LEVEL`` environment variable (default
+    ``INFO``). The ``litellm`` loggers are set to ``LITELLM_LOG_LEVEL``
+    (default ``WARNING``). Calling this function again for the same name does
+    not duplicate handlers, and a ``log_dir`` passed on a later call is still
+    honored.
+    Parameters
+    ----------
+    name : str
+        Logger name.
+    log_dir : str, optional
+        Directory to save log files. If None, logs are not saved to file.
+    log_filename : str, optional
+        Name of the log file (default: "sdg_hub.log").
+    Returns
+    -------
+    logging.Logger
+        Configured logger.
+    """
+    log_level = os.getenv("LOG_LEVEL", "INFO").upper()
     logger = logging.getLogger(name)
+    logger.setLevel(log_level)
+    # Suppress litellm logs to reduce noise
+    litellm_log_level = os.getenv("LITELLM_LOG_LEVEL", "WARNING").upper()
+    logging.getLogger("litellm").setLevel(litellm_log_level)
+    logging.getLogger("litellm.proxy").setLevel(litellm_log_level)
+    logging.getLogger("litellm.router").setLevel(litellm_log_level)
+    # Rich console handler; skipped when the logger already has handlers so
+    # repeated calls do not duplicate console output
+    if not logger.handlers:
+        rich_handler = RichHandler()
+        rich_handler.setLevel(log_level)
+        formatter = logging.Formatter("%(message)s", datefmt="[%X]")
+        rich_handler.setFormatter(formatter)
+        logger.addHandler(rich_handler)
+    # Optional file handler. Checked independently of the console handler so
+    # that a log_dir passed for an already-configured logger is not ignored
+    if log_dir is not None:
+        os.makedirs(log_dir, exist_ok=True)
+        # FileHandler stores its target as an absolute path in baseFilename,
+        # so compare absolute paths when looking for an existing handler
+        file_path = os.path.abspath(os.path.join(log_dir, log_filename))
+        already_attached = any(
+            isinstance(h, logging.FileHandler) and h.baseFilename == file_path
+            for h in logger.handlers
+        )
+        if not already_attached:
+            file_handler = logging.FileHandler(file_path, encoding="utf-8")
+            file_handler.setLevel(log_level)
+            file_formatter = logging.Formatter(
+                "%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+                datefmt="[%Y-%m-%d %H:%M:%S]",
+            )
+            file_handler.setFormatter(file_formatter)
+            logger.addHandler(file_handler)
     return logger

sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/__init__.py ADDED Viewed

File without changes

sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/__init__.py ADDED Viewed

File without changes

sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/detailed_summary.yaml ADDED Viewed

@@ -0,0 +1,11 @@
+- role: system
+  content: You are an expert at summarizing text.
+- role: user
+  content: |
+    Summarize the given document in an Abstract Conceptual Layer representation such that it captures overarching themes, main arguments, and core principles.
+    Make sure to include all the details from the document in the summary.
+    Document:
+    {{document_outline}}
+    {{document}}

sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/detailed_summary/flow.yaml ADDED Viewed

@@ -0,0 +1,159 @@
+metadata:
+  name: Detailed Summary Knowledge Tuning Dataset Generation Flow
+  description: Generates high level summaries of the document focusing on overarching themes, main arguments, and core principles. This is then converted into Question-Answer pairs.
+  version: 2.0.0
+  author: SDG Hub Contributors
+  recommended_models:
+    default: openai/gpt-oss-120b
+    compatible:
+    - meta-llama/Llama-3.3-70B-Instruct
+    - microsoft/phi-4
+    - mistralai/Mixtral-8x7B-Instruct-v0.1
+    experimental: []
+  tags:
+  - knowledge-tuning
+  - document-internalization
+  - question-generation
+  - qa-pairs
+  - detailed-summaries
+  license: Apache-2.0
+  min_sdg_hub_version: 0.2.0
+  dataset_requirements:
+    required_columns:
+    - document
+    - document_outline
+    - domain
+    - icl_document
+    - icl_query_1
+    - icl_query_2
+    - icl_query_3
+    description: 'Input dataset should contain documents with text content and domain classification. Each document should be substantial enough for meaningful question generation (minimum 100 words recommended). The flow generates three types
+      of summaries: detailed (n=20), extractive (n=10), and key facts (n=50), each producing corresponding QA pairs designed to help LLMs internalize document knowledge for knowledge tuning.'
+  output_columns:
+  - summary
+  - question
+  - response
+  - raw_document
+  - faithfulness_explanation
+  - faithfulness_judgment
+  id: mild-thunder-748
+blocks:
+- block_type: DuplicateColumnsBlock
+  block_config:
+    block_name: duplicate_document_col
+    input_cols:
+      document: base_document
+- block_type: PromptBuilderBlock
+  block_config:
+    block_name: detailed_summary_prompt
+    input_cols:
+    - document
+    - document_outline
+    output_cols: summary_prompt
+    prompt_config_path: detailed_summary.yaml
+    format_as_messages: true
+- block_type: LLMChatBlock
+  block_config:
+    block_name: gen_detailed_summary
+    input_cols: summary_prompt
+    output_cols: raw_summary
+    max_tokens: 4096
+    temperature: 0.7
+    n: 50
+    async_mode: true
+- block_type: TextParserBlock
+  block_config:
+    block_name: parse_detailed_summary
+    input_cols: raw_summary
+    output_cols: summary
+    start_tags:
+    - ''
+    end_tags:
+    - ''
+- block_type: RenameColumnsBlock
+  block_config:
+    block_name: rename_to_document_column
+    input_cols:
+      document: raw_document
+      summary: document
+- block_type: PromptBuilderBlock
+  block_config:
+    block_name: question_generation_prompt
+    input_cols:
+    - domain
+    - document
+    - document_outline
+    - icl_document
+    - icl_query_1
+    - icl_query_2
+    - icl_query_3
+    output_cols: question_generation_prompt
+    prompt_config_path: ../generate_question_list.yaml
+    format_as_messages: true
+- block_type: LLMChatBlock
+  block_config:
+    block_name: question_generation
+    input_cols: question_generation_prompt
+    output_cols: question_list
+    max_tokens: 256
+    temperature: 0.7
+    n: 1
+    async_mode: true
+- block_type: TextParserBlock
+  block_config:
+    block_name: parse_question_list
+    input_cols: question_list
+    output_cols: question
+    start_tags:
+    - '[QUESTION]'
+    end_tags:
+    - '[END]'
+- block_type: PromptBuilderBlock
+  block_config:
+    block_name: answer_generation_prompt
+    input_cols:
+    - question
+    - document
+    - document_outline
+    output_cols: answer_generation_prompt
+    prompt_config_path: ../generate_answers.yaml
+    format_as_messages: true
+- block_type: LLMChatBlock
+  block_config:
+    block_name: answer_generation
+    input_cols: answer_generation_prompt
+    output_cols: response_dict
+    max_tokens: 4096
+    temperature: 0.7
+    n: 1
+    async_mode: true
+- block_type: TextParserBlock
+  block_config:
+    block_name: parse_response_dict
+    input_cols: response_dict
+    output_cols: response
+    start_tags:
+    - ''
+    end_tags:
+    - ''
+    save_reasoning_content: true
+- block_type: EvaluateFaithfulnessBlock
+  block_config:
+    block_name: eval_faithfulness
+    input_cols:
+    - document
+    - response
+    output_cols:
+    - faithfulness_explanation
+    - faithfulness_judgment
+    prompt_config_path: ../../multi_summary_qa/instructlab/evaluate_faithfulness.yaml
+    filter_value: 'YES'
+    operation: eq
+    async_mode: true
+    format_as_messages: true
+    start_tags:
+    - '[Start of Explanation]'
+    - '[Start of Answer]'
+    end_tags:
+    - '[End of Explanation]'
+    - '[End of Answer]'

sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/__init__.py ADDED Viewed

File without changes

sdg_hub/flows/qa_generation/document_grounded_qa/enhanced_multi_summary_qa/extractive_summary/extractive_summary.yaml ADDED Viewed

@@ -0,0 +1,65 @@
+- role: system
+  content: You are an expert at summarizing text.
+- role: user
+  content: |
+    You will create an Enhanced Extractive Summary from the provided document.
+    Unlike a standard extractive summary that simply pulls key sentences, an Enhanced Extractive Summary adds rich contextual information and cognitive classification to each extracted segment.
+    For each significant section of the document, extract 2-4 key passages, and list them. Then for each extract annotate. Structure your response as follows:
+    ### Extracts from the Passage:
+    [List of extracts]
+    ### Annotations:
+    ### Extract [Number]
+    > "[Direct quote from the original text]"
+    **Context Marker**: [Brief description of where this extract fits within the document's narrative or argument structure]
+    **Relevance**: [Rate as Low, Medium, Medium-High, High, or Very High and briefly explain importance to main themes]
+    **Relationship**: [Explain how this extract connects to other extracts, by specifying extract number, or concepts in the document]
+    To help you understand the task, here is an example:
+    Document:
+    Remote work has grown by over 150% since 2020 due to the pandemic. Companies found that productivity remained stable, while employee satisfaction increased. However, challenges like communication gaps and team cohesion issues emerged. Firms are now adopting hybrid models to balance flexibility with collaboration.
+    ### Extracts from the Passage:
+    1. > "Remote work has grown by over 150% since 2020 due to the pandemic."
+    2. > "Companies found that productivity remained stable, while employee satisfaction increased."
+    3. > "However, challenges like communication gaps and team cohesion issues emerged."
+    4. > "Firms are now adopting hybrid models to balance flexibility with collaboration."
+    ### Annotations:
+    ### Extract 1
+    > "Remote work has grown by over 150% since 2020 due to the pandemic."
+    * **Context Marker**: This is the opening factual statement, providing temporal context and the catalyst for the changes discussed later.
+    * **Relevance**: **Very High** – It introduces the main subject and quantifies the scale of the transformation, anchoring the entire discussion.
+    * **Relationship**: Establishes the cause for changes in work patterns; leads directly to the evaluations in Extracts 2 and 3, and the resulting shift in Extract 4.
+    ### Extract 2
+    > "Companies found that productivity remained stable, while employee satisfaction increased."
+    * **Context Marker**: Positioned after the growth in remote work, this extract summarizes key benefits observed by firms.
+    * **Relevance**: **High** – Highlights why remote work gained support: it delivered business continuity and improved employee morale.
+    * **Relationship**: Works in tandem with Extract 3 to present a balanced view of remote work’s outcomes; explains part of the motivation behind hybrid models in Extract 4.
+    ### Extract 3
+    > "However, challenges like communication gaps and team cohesion issues emerged."
+    * **Context Marker**: Marks a turning point in the narrative, shifting from benefits to complications of remote work.
+    * **Relevance**: **High** – Adds nuance by introducing critical downsides that companies faced.
+    * **Relationship**: Contrasts with Extract 2 and sets up the rationale for the hybrid solution in Extract 4.
+    ### Extract 4
+    > "Firms are now adopting hybrid models to balance flexibility with collaboration."
+    * **Context Marker**: Concluding insight, presenting the emerging consensus or strategy being adopted in response to earlier findings.
+    * **Relevance**: **Very High** – Synthesizes the document’s insights into a forward-looking solution.
+    * **Relationship**: Resolves the tension highlighted in Extracts 2 and 3; represents the evolution sparked by the situation in Extract 1.
+    Now it's your turn to create an Enhanced Extractive Summary from the provided document.
+    Document:
+    {{document_outline}}
+    {{document}}

sdg-hub 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

sdg-hub 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl